\documentclass[accepted]{uai2025}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

\usepackage{hyperref} 


\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{lipsum}		% Can be removed after putting your text content
\usepackage{graphicx}
\usepackage{natbib}
\usepackage{doi}
\usepackage{mathtools}
\usepackage{xspace}	
\usepackage{amsthm}
\usepackage{setspace}
\usepackage{comment}
\usepackage{amsmath,amssymb}
\usepackage{tablefootnote}
\usepackage{array, longtable}
\usepackage{bbm}
\usepackage{xcolor}
\usepackage{epigraph}
\usepackage{enumitem}
\usepackage{adjustbox}
\usepackage{graphicx}
\usepackage{colortbl}
\usepackage{multirow}
\usepackage{array}
\usepackage{hyperref}
\usepackage{footnotehyper}
\usepackage{fnbreak}
\usepackage{enumitem}
\usepackage{threeparttable}
\usepackage{subcaption}
\usepackage{wrapfig}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{tabularx}
\usepackage{cleveref}
% \usepackage{stix}
\usepackage{booktabs} % For \toprule, \midrule, \bottomrule
\usepackage{siunitx} 
\interfootnotelinepenalty=10000 % prevents a footnote from being split across pages


\definecolor{maroon}{cmyk}{0,0.87,0.68,0.32}
\definecolor{yellow}{cmyk}{0,0,1,0}

\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
	
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}
\newtheorem{example}{Example}
\newtheorem{remark}{Remark}

\newcommand{\blue}[1]{{\color{blue}#1}}
\usepackage{tcolorbox}
\usepackage{pifont}
\definecolor{mydarkgreen}{RGB}{39,130,67}
\definecolor{mydarkred}{RGB}{192,47,25}
\definecolor{mypurple}{RGB}{250, 150, 250}
\newcommand{\green}{\color{mydarkgreen}}
\newcommand{\red}{\color{mydarkred}}
\newcommand{\cmark}{{\green\ding{51}}}
\newcommand{\xmark}{{\red\ding{55}}}

\newlist{assumlist}{enumerate}{1}
\setlist[assumlist,1]{
    label=(\alph*),
    ref=\theassumption(\alph*),
    align=left,
    leftmargin=0.4cm
}

% REDEFINING COMMANDS
\renewcommand{\leq}{\leqslant}
\newcommand{\E}{\mathbb{E}}




\usepackage{xargs}                      % Use more than one optional parameter in a new commands
 
\usepackage[textsize=tiny]{todonotes}
\newcommandx{\mt}[2][1=]{\todo[linecolor=red,backgroundcolor=red!25,bordercolor=red,#1]{#2}}
\newcommandx{\mti}[2][1=]{\todo[inline,linecolor=red,backgroundcolor=red!25,bordercolor=red,#1]{#2}}


% Therefore, a short form for the running title is supplied here:
\title{When Extragradient Meets PAGE: \\Bridging Two Giants to Boost Variational Inequalities}

% Add authors
\author[1,2]{\href{mailto:<molodtsov.gl@phystech.edu>?Subject=Your UAI 2025 paper}{Gleb Molodtsov}{}}
\author[1]{Valery Parfenov}
\author[1]{Egor Petrov}
\author[1]{Evseev Grigoriy}
\author[1,2]{Daniil Medyakov}
\author[2,1,3]{Aleksandr Beznosikov}
% Add affiliations after the authors
\affil[1]{%
    Moscow Institute of Physics and Technology
}
\affil[2]{%
    Ivannikov Institute for System Programming of the RAS
}
\affil[3]{%
    Innopolis University
  }

\begin{document}

\maketitle

\begin{abstract}
Variational inequalities (VIs) have emerged as a universal framework for solving a wide range of problems. A broad spectrum of applications includes optimization, equilibrium analysis, reinforcement learning, and the rapidly evolving field of generative adversarial networks (GANs). Stochastic methods have proven to be powerful tools for addressing such problems, but they often suffer from irreducible variance, necessitating the development of variance reduction techniques. Among these, \textsc{SARAH}-based algorithms have demonstrated remarkable practical effectiveness. In this work, we propose a new stochastic variance reduced algorithm for solving stochastic variational inequalities.
We push the boundaries of existing methodologies by leveraging \textsc{PAGE} method to solve VIs. Unlike prior studies that lacked theoretical guarantees under general assumptions, we establish rigorous convergence rates, thus closing a crucial gap in the literature. Our contributions extend both theoretical understanding and practical advancements in solving variational inequalities.
To substantiate our claims, we conduct extensive experiments across diverse benchmarks, including a widely studied denoising task. The results consistently showcase the superior efficiency of our approach, underscoring its potential for real-world applications.
\end{abstract}

\section{Introduction}\label{sec:introduction}
Variational inequalities (VIs) have been a cornerstone of mathematical research for a long time, offering an approach to solving a wide range of problems. Since the pioneering work of \citet{browder1965nonexpansive}, they have become an indispensable tool. We consider the VI problem of finding a solution $z^* \in \mathcal{Z}$ that satisfies the following condition:
\begin{align}\label{eq:vi_setting}
    \forall z \in \mathcal{Z} \hookrightarrow  
    \langle F(z^*), z - z^*\rangle \geqslant 0,
\end{align}
where \( F \) is a monotone operator.
Variational inequalities offer a versatile framework for tackling various mathematical challenges, including minimization problems, saddle and fixed point problems \citep{stampacchia1964formes, facchinei2003finite, kinderlehrer2000introduction}. To build intuition, we present several illustrative examples.

\begin{example}[Convex optimization]\label{ex:convopt}
    Consider the optimization problem:
    \begin{equation}\label{eq:convregprob}
        \underset{z\in\mathbb R^d}{\min} \left[f(z) \right].
    \end{equation}
    Here, $f$ represents a smooth data fitting term. In this scenario, let $F(z) = \nabla f(z)$. Thus, if $f$ is convex, the optimization problem \eqref{eq:convregprob} can be reformulated within the variational inequality framework.
\end{example}
One of the main reasons for the widespread use of VIs is that many non-smooth optimization problems can be reformulated as saddle point problems, significantly improving solution efficiency \citep{nesterov2005smooth, nemirovski2004prox, chambolle2011first, esser2010general}.
\begin{example}[Convex-concave saddle points]\label{ex:convconcsaddle}
    Now, consider the convex-concave saddle point problem:
    \begin{equation}\label{eq:convconcsaddle}
        \underset{x\in \mathbb R^{d_x}}{\min}\underset{y\in \mathbb R^{d_y}}{\max} \left[f(x,y)\right].
    \end{equation}
    In this setting, $f$ serves the same role as in Example \ref{ex:convopt}. Define $F(z) = F(x, y) = \left[\nabla_x f(x, y), -\nabla_y f(x, y)\right]$. Thus, if $f$ is smooth and convex-concave, this establishes the connection between the saddle point formulation \eqref{eq:convconcsaddle} and variational inequalities.
\end{example}
The investigation of minimization problems is frequently conducted independently of VIs. However, the study of saddle point problems is closely intertwined with VIs, as these two areas share a strong theoretical and practical connection. 
\begin{example}[Fixed points] Consider the fixed point problem:
\begin{equation} \label{eq:fixed_point}\text{Find } z^* \in \mathbb{R}^d \text{~~such that~~}T\left(z^*\right)=z^*,
\end{equation}
where $T: \mathbb{R}^d \rightarrow \mathbb{R}^d$ is an operator. With $F(z)=z-T(z)$, it can be proved that $z^* \in \mathbb{R}^d$ is a solution for \eqref{eq:vi_setting} if and only if $F\left(z^*\right)=0$, i.e. $z^* \in \mathbb{R}^d$ is a solution for \eqref{eq:fixed_point}.
\end{example}

Additionally, recent research has established important connections between VIs and fields such as reinforcement learning \citep{omidshafiei2017deep, jin2020efficiently}, adversarial training \citep{madry2017towards}, and generative adversarial networks (GANs) \citep{goodfellow2014generative}. In particular, in-depth analysis of variational inequalities provides theoretical insights and practical guidance for improving GAN training methods \citep{daskalakis2017training, gidel2018variational, mertikopoulos2018optimistic, chavdarova2019reducing, liang2019interaction, peng2020training}.

Beyond these modern applications, VIs also play a crucial role in classical problems such as clustering \citep{xu2004maximum}, matrix factorization \citep{bach2008convex}, image denoising \citep{esser2010general, chambolle2011first}, robust optimization \citep{ben2009robust}, economic modeling, game theory \citep{von1953theory}, and optimal control \citep{facchinei2003finite}.

Despite their broad applicability, solving variational inequalities presents significant challenges. Traditional optimization techniques, such as the gradient method, often fail in this context, both in terms of efficiency and theoretical convergence guarantees \citep{harker1990finite, beznosikov2023smooth}. Among the many approaches developed for solving VIs, the \textsc{Extragradient} method \citep{korpelevich1976extragradient, mokhtari2020unified} has proven to be one of the most effective.

Recent advances in machine learning and data science introduce additional complexities. The growing size of datasets and increasing model complexity demand computationally efficient algorithms \citep{bottou2010large, dean2012large}.
A fundamental optimization problem underlying many machine learning tasks is Empirical Risk Minimization (ERM). In the context of distributed systems, where data is spread across multiple devices, the ERM problem is commonly formulated as:
% strategy to address this issue is to decompose the operator $F$ into a finite sum
\begin{align}
\label{eq:setting_dist}
\underset{z \in \mathbb{R}^d}{\min} \left[f(z) = \frac{1}{n}\sum\limits_{i = 1}^n \mathbb E_{\xi_i\sim \mathcal{D}_i}\left[f_{\xi_i} (z)\right]\right],
\end{align}
where $\mathcal{D}_i$ is an unknown distribution of the training sample data on the $i$-th device.
A particularly important and widely studied case arises when all nodes share the same underlying data distribution, i.e., $\mathcal{D}_i = \mathcal{D}$ for all $i$. In this setting, the objective \eqref{eq:setting_dist} reduces to a finite-sum optimization problem:
\begin{equation}\label{eq:finite-sum}
    f(z) = \frac{1}{n}\sum\limits_{i=1}^n f_i(z),
\end{equation}
where each $f_i(z) := \mathbb{E}_{\xi \sim \mathcal{D}} [f_\xi(z)]$ corresponds to the expected loss over a local mini-batch or a single sample, assuming access to identical distributions across nodes.

Moreover, this finite-sum function can be reformulated as an adversarial training problem:  
\begin{equation} \label{problem_adversarial}
\begin{aligned}
\min_{w \in \mathbb{R}^d} & \max_{\|r_i\| \leqslant D}  
\Bigg[
\frac{1}{N} \sum \nolimits_{i=1}^N 
\left( w^T(x_i+r_i) - y_i \right)^2  \\
&~~~~~~~~~~~~~~~~~~~ \quad\quad\quad + \frac{\lambda}{2} \|w\|^2  
- \frac{\beta}{2} \|r\|^2
\Bigg],
\end{aligned}
\end{equation}
where the samples correspond to features $x_i$ and targets $y_i$. 
This reformulation enables efficient large-scale problem solving.


Stochastic algorithms are particularly well-suited for handling such problems as \eqref{eq:finite-sum}. Instead of computing the full operator value at each iteration, stochastic methods randomly sample $F_i$. The stochastic \textsc{Extragradient} method \citep{juditsky2011solving} follows this principle by selecting independent random indices $i_t, j_t$ at iteration $t$ and performing the following updates:
\begin{equation}\label{eq:egstep}
    \begin{aligned}
        z^{t+\frac{1}{2}} &= z^t - \gamma F_{i_t} (z^t),\\
        z^{t+1} &= z^t - \gamma F_{j_t} (z^{t + \frac{1}{2}}).
    \end{aligned}
\end{equation}

    This method extends the classical \textsc{SGD} approach \citep{robbins1951stochastic} by incorporating an additional step to improve stability. However, it suffers from high variance in stochastic operator estimates, limiting its convergence to a neighborhood of the optimal solution rather than the exact solution itself \citep{juditsky2011solving, mishchenko2020revisiting}. This issue also affects classical \textsc{SGD} \citep{bottou2009curiously, moulines2011non, gower2020variance}. The intuition behind this problem can be easily extracted from the example with a setup involving heterogeneous data, where near the optimal point $\nabla f(z^*) = \frac{1}{n}\sum_{i=1}^{n}\nabla f_i(z^*) \rightarrow 0$, while some $\nabla f_i(z^*)$ can retain finite values. As a result, in the optimum region, \textsc{SGD} will take large steps, preventing it from reaching the optimum.

A major breakthrough in addressing this limitation was the introduction of variance reduction (VR) techniques, originally developed for finite-sum minimization \citep{johnson2013accelerating}. At each iteration, an index $i_t$ is selected along with a reference point $\omega^t$, which is periodically updated or chosen probabilistically \citep{kovalev2020don}. In the context of convex optimization, the variance-reduced gradient at $z^{t+\frac{1}{2}}$ is given by:
\begin{equation}\label{eq:svrg_update_rool}
    g (z^{t+\frac{1}{2}}) = \nabla f_{i_t} (z^{t+\frac{1}{2}}) - \nabla f_{i_t} (\omega^t) + \nabla f (\omega^t).
\end{equation}

Variance reduction techniques construct more accurate gradient estimators over time, enabling the use of larger step sizes and accelerating convergence. 

In addition to the widely used \textsc{SVRG} \citep{johnson2013accelerating}, related methods include \textsc{SAG} \citep{roux2012stochastic, schmidt2017minimizing}, \textsc{SAGA} \citep{defazio2014saga, qian2019saga}, and \textsc{Finito} \citep{defazio2014finito}.
However, for both convex and non-convex smooth minimization problems, the best convergence guarantees are given by another variance reduction technique, \textsc{SARAH} \citep{nguyen2017sarah, hu2019efficient} (and its modifications: \textsc{SPIDER} \citep{fang2018spider}, \textsc{STORM} \citep{cutkosky2019momentum}). Notably, its loopless version \textsc{PAGE} \citep{li2021page} has garnered significant interest due to its ability to provide improved convergence guarantees through probabilistic reference point updates.

The \textsc{SARAH} technique avoids storing all components of the reference gradient and works with a biased gradient estimator in the inner loop:
\begin{align}\label{eq:sarah_update_rool}
    g^{t} = \nabla f_{i_t} (z^{t + \frac{1}{2}}) - \nabla f_{i_t} (z^{t - \frac{1}{2}}) + g^{t-1}.
\end{align}
Biasedness complicates the theoretical analysis. At the same time, such an update rule leads to smoother changes in the gradient estimator $g$ from iteration to iteration, lower memory costs, and better practical performance. Returning to the example with heterogeneous data, this time the difference $\nabla f_{i_t} (z^{t}) - \nabla f_{i_t} (z^{t-1})$ is small when the steps are small. This allows $g^t$ to retain its scale after being initialized with the full gradient in the outer loop. Therefore, the issue with large gradient estimators near the optimum is resolved. Additionally, this illustration highlights the practical difference between the update rules \eqref{eq:sarah_update_rool} and \eqref{eq:svrg_update_rool}. Indeed, \eqref{eq:sarah_update_rool} uses the gradient difference between consecutive points, while \eqref{eq:svrg_update_rool} considers the difference between the current and reference points. This provides an additional boost to \eqref{eq:sarah_update_rool}.

The probabilistic approach simplifies the theoretical analysis achieving the best convergence guarantees. Particularly, we draw attention to the iteration of \textsc{PAGE}, which provides the intuition behind our algorithm:
\begin{align*}
    g^t = \begin{cases}
        \nabla{f} (z^t) &\hspace{-2mm},\textit{with prob.}~p\\
        \nabla f_{i_t} (z^t) - \nabla f_{i_t} (z^{t-1}) + g^{t-1} &\hspace{-2mm},\textit{with prob.}~1-p.
    \end{cases}
\end{align*}

Meanwhile, current research continues to explore the application of variance reduction techniques for solving variational inequalities. Although most methods in this area are based on \textsc{SVRG} \citep{alacaoglu2021stochastic, medyakovshuffling}, the more practically beneficial \textsc{SARAH} method has received limited attention, with only a few studies examining its application \citep{beznosikov2023sarah}. 
Our work bridges this gap by proposing the use of this variance reduction technique in its loopless version for variational inequalities under broader assumptions of Lipschitz continuity and monotonicity.

\subsection*{Brief literature review} 

% A wide range of methods has been developed to address variational inequality problems. 
$\bullet$ \textbf{Deterministic approaches for solving VIs.} As previously noted, the \textsc{Extragradient} method \citep{korpelevich1976extragradient} is a classical deterministic approach for solving the problem \eqref{eq:vi_setting} in the Euclidean setting. Building on this, the \textsc{Mirror-prox} method \citep{nemirovski2004prox} was introduced, incorporating Bregman divergence to extend the framework to non-Euclidean geometries. In addition to these, several other deterministic methods have been proposed for solving VIs, including \textsc{forward-backward-forward (FBF)} \citep{tseng2000modified}, \textsc{Dual extrapolation} \citep{nesterov2007dual}, \textsc{reflected gradient} \citep{malitsky2015projected}, and \textsc{forward-reflected-backward (FoRB)} \citep{malitsky2020forward}.

$\bullet$ \textbf{Stochastic methods for VIs.} The application of various stochastic methods for solving variational inequalities and saddle point problems has been the subject of extensive research. The first stochastic versions of algorithms for solving variational inequalities were proposed by \citet{juditsky2011solving}. The idea was further developed in \citep{gidel2018variational, hsieh2019convergence, mishchenko2020revisiting, hsieh2020explore, gorbunov2022stochastic, beznosikov2023smooth, beznosikov2024first, solodkin2024methods}. Subsequently, researchers employed variance reduction techniques to mitigate the inherent variance in these stochastic methods. Specifically, \citet{palaniappan2016stochastic} explored a stochastic \textsc{gradient method} with VR, combining \textsc{SVRG} with Catalyst acceleration.

The combination of these techniques with methods traditionally used for variational inequalities appeared in \citet{chavdarova2019reducing}, who integrated \textsc{Extragradient} with \textsc{SVRG}, leveraging variance reduction to achieve improved convergence rates. The aforementioned variance reduction technique has also been explored later \citep{Yura2021, alacaoglu2021stochastic, kovalev2022optimalvi, beznosikov2022unified}.

Although most of the methods were based on the \textsc{SVRG} approach, some studies focused on analyzing the \textsc{SARAH} method, which is more appealing from a practical standpoint for minimization problems.  
Thus, \citet{chen2022faster} proposed \textsc{SPIDER-GDA}, achieving a stochastic first-order oracle complexity of $\mathcal{O}\left(\left(n+\sqrt{n} \kappa_x \kappa_y^2\right) \log (1 / \epsilon)\right)$ under two-sided conditions ($\kappa_x = \nicefrac{L}{\mu_x}$, $\kappa_y = \nicefrac{L}{\mu_y}$). 
The given estimate has a significant drawback: it depends cubically on $\nicefrac{L}{\mu}$. In reality, while batch size parameters $n$ can be dynamically adjusted to influence convergence speed, the problem parameters remain fixed. Consequently, despite potential gains from adjusting $n$, the overall estimate typically presents a much worse scenario on average. Later, \citet{beznosikov2023sarah} presented results for the \textsc{SARAH} method with objective functions under a cocoercivity condition on the operator. However, the given assumption is a more stringent analogue of the Lipschitz continuity condition and does not hold even for training a neural network with two convolutional layers \citep{cybenko1989approximation}. A comparison of these assumptions for variational inequalities is provided in \citep{loizou2021stochastic}. In contrast, our study offers an analysis based on more general assumptions.
% 


\subsection*{Contributions}
Our main contributions are highlighted here. 
\begin{itemize}
    \item \textit{Adaptation of \textsc{PAGE} for Variational Inequalities.} We present an application of the \textsc{PAGE} method, leveraging its practically beneficial variance reduction technique for solving variational inequalities.
    \item \textit{Convergence Estimates under General Assumptions.} We provide theoretical convergence estimates for our method under more general assumptions on the operator and problem conditions (Lipschitz constant), surpassing previous studies in this area.
    \item \textit{Comprehensive Experimental Validation.} Extensive experiments demonstrate the superiority of applying \textsc{PAGE} to the \textsc{Extragradient} method over its vanilla version and its previous combinations with variance reduction techniques. To validate our approach, we conducted the following experiments:  
\begin{enumerate}[leftmargin=0pt]
    \item Training ResNet-18 on CIFAR-10 for a multi-class classification task.  
    \item Image denoising as a practical application of saddle-point methods.  
    \item Solving toy bilinear tasks to analyze performance in controlled settings.  
    \item Adversarial training to highlight robustness and efficiency. 
\end{enumerate}
\end{itemize}


\subsection*{Setup}
\textbf{Notation.} In this paper, we use $\langle x,y  \rangle \vcentcolon= \sum\nolimits_{i=1}^d x_i y_i$ to denote the standard inner product of $x,y\in \mathbb{R}^d$, where $x_i$ corresponds to the $i$-th component of $x$ in the standard basis of $\mathbb{R}^d$. It induces the $\ell_2$-norm in $\mathbb{R}^d$ in the following way: $\|x\| \vcentcolon=\|x\|_2=\sqrt{\langle x, x \rangle}$.

Recall that we consider the problem \eqref{eq:vi_setting}, where the operator $F$ has the form \eqref{eq:finite-sum}. Additionally, we present a list of assumptions within which we obtain the main statements.
\begin{assumption}\label{ass:lip} \textbf{(Lipschitzness.)}  
The operator $F$ has a stochastic oracle $F_{i}$ that is unbiased, i.e., $F(z_1)=\mathbb{E}\left[F_{i}(z_1)\right]$, and is $L$-Lipschitz in mean:
\[
\mathbb{E}_{i}\left[\left\|F_{i}(z_1)-F_{i}(z_2)\right\|^2\right] \leq L^2\|z_1-z_2\|^2
\]
for any \( z_1, z_2 \in  \mathcal{Z} \).  
\end{assumption}

Note that $F$ can be expressed as a finite sum, $F = \frac{1}{n}\sum_{i=1}^n F_i$, where each component $F_i$ is $L_i$-Lipschitz continuous, and the full operator $F$ is $L_F$-Lipschitz. By applying the triangle inequality, it naturally follows that $L_F \leq \frac{1}{n}\sum_{i=1}^n L_i$. On the one hand, the sum $\frac{1}{n}\sum_{i=1}^n L_i$ can be significantly larger than $L_F$. On the other hand, while computing each individual $L_i$ may be straightforward, determining the exact value of $L_F$ might not be feasible. In such cases, the inequality provides a practical upper bound for $L_F$. 

Even in the general form, the problem demonstrates potential issues caused by suboptimal stochastic oracles. If the Lipschitz constant $L$ of our stochastic oracle is significantly worse (i.e., larger) than $L_F$, it can negate the benefits of using inexpensive stochastic oracles. In what follows, for finite-sum problems, we assume that Lipschitz constants are similar for two arbitrary oracles from this sum.

\begin{assumption} \label{ass:monotonicity}
\textbf{(Monotonicity conditions.)}  
We need two cases of monotonicity:  
\begin{itemize}  
    \item[(a)] \label{ass:str_monotone}  
 \textbf{Strong monotonicity:} Operator \( F \) is \( \mu \)-strongly monotone, i.e.,  
    \[
    \langle F(z_1) - F(z_2), z_1 - z_2\rangle \geqslant \mu\|z_1 - z_2\|^2
    \]  
    for any \( z_1, z_2 \in  \mathcal{Z} \).  
    \item[(b)] \label{ass:monotone}  \textbf{Monotonicity:} Operator \( F \) is monotone, i.e.,  
    \[
    \langle F(z_1) - F(z_2), z_1 - z_2\rangle \geqslant 0
    \]  
    for any \( z_1, z_2 \in  \mathcal{Z} \).  
\end{itemize}  
\end{assumption}

% \begin{assumption}\label{ass:bound}
%     Each stochastic operator $F_i$ and full operator $F$ is bounded at the point of the solution $z^*\in\text{dom}~ g$, i.e. $\mathbb E \|F_i(z^*)\|^2 \leqslant \sigma_*^2, \|F(z^*)\|^2 \leqslant \sigma_*^2$.  
% \end{assumption}
For minimization problems, Assumption \hyperref[ass:str_monotone]{2(a)} means strong convexity, and for saddle point problems, strong convexity--strong concavity. At the same time, variance reduction methods are usually considered under Assumption \ref{ass:lip} or its analogues, such as $L$-smoothness in the worst-case scenario. In light of these facts, our assumptions are classic for such problems. 

\section{Algorithms and convergence analysis}\label{sec:main}

Having established the necessary background, we can now proceed to the main theoretical contribution of our paper. Let us start with our Algorithm \ref{alg:egsarah} (\textsc{ExtraPAGE}). 

\begin{algorithm}
\caption{\textsc{ExtraPAGE}}\label{alg:egsarah}
\begin{algorithmic}[1]
    \State \textbf{Input:} Initial points $z^{-\nicefrac{1}{2}}=z^0\in\mathbb{R}^d$; Initial gradient $G^{-1} = F(z^{-\nicefrac{1}{2}})$
    \State \textbf{Parameter:} Stepsize $\gamma > 0$, probability $p \in (0, 1]$
    \For {$t = 0, 1, 2, \ldots, T-1$}
    \State \label{line:half_integer_point_update} $z^{t+\frac{1}{2}} = z^t - \gamma G^{t-1}$
    \State \label{line:update_rool} $G^t = \begin{cases}
        F(z^{t+\nicefrac{1}{2}}), &\hspace{-2mm} p \\
        G^{t-1} + F_{i^t}(z^{t + \nicefrac{1}{2}}) - F_{i^t}(z^{t - \nicefrac{1}{2}}), &\hspace{-2mm} 1 - p
    \end{cases}$
    \State \label{line:integer_point_updete} $z^{t+1} = z^t - \gamma G^t$
    \EndFor
\State \textbf{Output:} $z^T$
\end{algorithmic}
\end{algorithm}

Line \ref{line:update_rool} demonstrates that \textsc{ExtraPAGE} follows the principle of the \textsc{PAGE} update rule. In particular, the oracle uses information about the operator from previous iterations in order to achieve a variance reduction effect. At the same time, it does not rely on the reference point concept; instead, we use a probabilistic approach, defining \[G^t = F(z^{t+\nicefrac{1}{2}})\] with probability $p$ and \[G^t = G^{t-1} + F_{i^t}(z^{t + \nicefrac{1}{2}}) - F_{i^t}(z^{t - \nicefrac{1}{2}})\] with probability $1-p$. This means that we compute the full gradient once every $\frac{1}{p}$ iterations on average. Therefore, using $p \sim \frac{1}{n}$ significantly reduces oracle complexity compared to classical \textsc{GD}, losing to \textsc{SGD} in per-iteration computational cost only by a constant factor. 

Let us now recall the classical \textsc{Extragradient} step \eqref{eq:egstep} and pay special attention to how we adapt it to our case. As in the vanilla \textsc{Extragradient} method, we use $z^{t+\nicefrac{1}{2}}$, computed in Line \ref{line:half_integer_point_update}. One can note that we use the newly computed variance-reduced gradient estimator to perform the main step of the method, and the previous one to find the extrapolation point. In this way, we accurately adapt the variance reduction idea to the \textsc{Extragradient} technique. However, in contrast to the vanilla stochastic \textsc{Extragradient} method, our algorithm updates the gradient estimator only once per iteration. The theoretical analysis has revealed that a second gradient estimator update makes the recursion more complex without yielding better convergence guarantees.

It is worth noting that \textsc{SARAH} lacks an essential feature of unbiasedness in stochastic operators compared to the \textsc{SVRG} algorithm:
\begin{equation*}
    \mathbb E_{i^t}\left[G_{i^t}(z^t)\right] \neq \frac{1}{n}\sum\limits_{i = 1}^n F_{i}(z^t) = F(z^t).
\end{equation*}
This limitation results in a more complex analysis and requires non-standard techniques to establish convergence. More particularly, the only remaining tool for theoretical analysis is evaluation of terms under the expectation, with explicit use of algorithm iterations.

Now we turn to the formal analysis. First, we would like to provide a brief discussion. In stochastic optimization, computationally cheap gradient estimators are used instead of exact gradients. In this way, stochastic algorithms achieve acceleration in terms of iteration cost, but to keep their iterations effective, we strive to minimize the difference between the estimator and the original gradient. As mentioned earlier, the concept of variance reduction methods is to collect information from previous iterations and use it to improve the quality of the gradient estimation at the current point. Consequently, controlling the difference between the original gradient and its estimator is a key consideration in the development of an effective stochastic method. Our algorithm enables a recursive analysis of the squared norm of this difference, which is formalized in Lemma \ref{lemma}. This lemma is pivotal not only for deriving convergence guarantees but also for gaining fundamental insights into variance reduction techniques.

\begin{lemma}\label{lemma}
    \textit{For iterations of Algorithm \ref{alg:egsarah} the following equality holds:}
    \begin{align*}
    &\E_{i^t}\E_{G^{t-1}}\left\|F(z^{t+\nicefrac{1}{2}})-G^{t}\right\|^2 
    = \\
    &(1-p)\bigg[\E_{i^t}\left\|F_{i^t}\left(z^{t+\nicefrac{1}{2}}\right) 
    - F_{i^t}\left(z^{t-\nicefrac{1}{2}}\right)\right\|^2
     \\ &+\left\|F(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2 
    -\left\|F\left(z^{t+\nicefrac{1}{2}}\right)- F\left(z^{t-\nicefrac{1}{2}}\right)\right\|^2 \bigg].
    \end{align*}

%    \begin{align*}
%    \textstyle{\E_{i^t}\E_{G^{t-1}}\left\|F(z^{t+\nicefrac{1}{2}})-G^{t}\right\|^2} &\textstyle{=(1-p)\biggl[\left\|F(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2} \\ \textstyle{}&~~\textstyle{+\E_{i^t}\left\|F_{i^t}\right(z^{t+\nicefrac{1}{2}}\left) - F_{i^t}\left(z^{t-\nicefrac{1}{2}}\right)\right\|^2} \\
%        &~~\textstyle{-\left\|F\left(z^{t+\nicefrac{1}{2}}\right)- F\left(z^{t-\nicefrac{1}{2}}\right)\right\|^2 \biggr].}
%    \end{align*}
\end{lemma}

If $F = \frac{1}{n}\sum\limits_{i=1}^nF_i$ represents the loss of the model on homogeneous data, the first term in brackets tends to be small, which means that the quality of the estimator is preserved during the execution of the algorithm. In our setting, Assumption \ref{ass:lip} provides a further estimate:
\begin{align*}
        \E\left\|F(z^{t+\nicefrac{1}{2}})-G^{t}\right\|^2 &\leq (1-p)\E\left\|F(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2 \\
        &~~+(1-p)L^2\E\left\|z^{t+\nicefrac{1}{2}} - z^{t-\nicefrac{1}{2}}\right\|^2\hspace{-1mm}.
\end{align*}

At this point, we introduce our main theoretical result. Biasedness not only complicates the analysis but also affects the convergence criteria. Theorem \ref{theorem} provides convergence guarantees for the \textsc{ExtraPAGE} algorithm based on a specifically constructed function that ensures the stability of the method: 
\begin{align*}
  V^t &= \E\Big[\left\|z^t - z^*\right\|^2  + \gamma^2H\left\|F(z^{t-\nicefrac{1}{2}})-G^{t-1}\right\|^2 \\
  &~~~~~~~~~+ 2\gamma M\left\langle F(z^{t-\nicefrac{1}{2}})-G^{t-1},z^{t-\nicefrac{1}{2}}-z^*\right\rangle \\
  &~~~~~~~~~~~~~+\gamma^2\left\|G^{t}-G^{t-1}\right\|^2 
        \Big]  ,
\end{align*}
where $M = \frac{1-p}{p-\gamma \mu}$ and $H = 70n^3$.
Taking into account the choice of the stepsize $\gamma$ and probability $p$ in Theorem \ref{theorem}, $M$ can be estimated as $M \sim n$. We highlight the third term in the Lyapunov function $V^t$, which is a scalar product. Such a term can be negative and is unusual for Lyapunov functions in variational inequality problems.


\begin{theorem}
    \label{theorem}
    \textit{Under Assumptions \ref{ass:lip}, \hyperref[ass:str_monotone]{2(a)}, after $T$ iterations of Algorithm \ref{alg:egsarah} with $\gamma\leqslant \frac{1}{30Ln^{\nicefrac{3}{2}}}, p=\frac{1}{n}$, the following holds:}
    \begin{eqnarray*}
        V^T \leqslant \left(1 - \gamma\mu\right)^{T} \left\|z^0 - z^*\right\|^2.
    \end{eqnarray*}
\end{theorem}

Thus, we have established the linear convergence of Algorithm \ref{alg:egsarah} with respect to the function $V$. It is important to note that $V$ includes terms containing the difference $F(z^{t+\nicefrac{1}{2}}) - G^t$, which imposes tight restrictions on $\left\|F(z^{t+\nicefrac{1}{2}}) - G^t\right\|$. This suggests that $G^t$ provides a sufficiently accurate approximation of $F(z^{t+\nicefrac{1}{2}})$, thereby validating the effectiveness of our choice for the update rule of $G^t$.

Corollary \ref{corollary_usual_convergance} reflects the superiority of the obtained guarantees based on the function $V^t$ over the usual criterion $\left\|z^t - z^*\right\|^2$, which is not obvious due to the possible negativity of the scalar product $\left\langle F(z^{t-\nicefrac{1}{2}})-G^{t-1},z^{t-\nicefrac{1}{2}}-z^*\right\rangle$.

\begin{corollary}\label{corollary_usual_convergance}
    \textit{In settings of Theorem \ref{theorem}, after $T$ iterations of Algorithm \ref{alg:egsarah} with $\gamma\leqslant \frac{1}{30Ln^{\nicefrac{3}{2}}}$ and $ p=\frac{1}{n}$, the following holds:}
    \begin{align*}
        \textstyle{\E\Big[\frac{1}{2}\|z^T - z^*\|^2 }
     &\textstyle{+\frac{\gamma^2 H}{2}\left\|F(z^{T-\nicefrac{1}{2}})-G^{T-1}\right\|^2\Big]} \\
     &\textstyle{\leq \left(1 - \gamma\mu\right)^{T} \left\|z^0 - z^*\right\|^2 .}
    \end{align*}
\end{corollary}


As a final point of our theoretical analysis, we introduce Corollary \ref{corollary}.

\begin{corollary}\label{corollary}
    Suppose Assumptions \ref{ass:lip}, \hyperref[ass:str_monotone]{2(a)} hold. Then Algorithm \ref{alg:egsarah} with $\gamma = \frac{1}{30Ln^{\nicefrac{3}{2}}}$ and $p = \frac{1}{n}$, to reach $\varepsilon$-accuracy, where $\varepsilon \sim V^T$, needs
    \begin{equation*}
    \mathcal{O}\left(\frac{Ln^{\nicefrac{3}{2}}}{\mu}\log\frac{1}{\varepsilon} \right)~~\text{iterations and oracle calls.}
    \end{equation*}
\end{corollary}
Let us briefly discuss the result. Compared with other estimates in this class, under Assumptions \ref{ass:lip} and \hyperref[ass:str_monotone]{2(a)}, our algorithm has a worse dependence on \( n \). Nevertheless, this phenomenon is explainable. The convergence analysis of variance reduction methods that are not unbiased, particularly in the extragradient setting, inherently introduces an inner product term of the form $\left\langle F(z^{t-\nicefrac{1}{2}})-G^{t-1},z^{t-\nicefrac{1}{2}}-z^*\right\rangle$ in the function governing the convergence rate. This inner product significantly complicates the analysis. Placing this term in the recursion leads to the emergence of an additional factor of $n^{\nicefrac{1}{2}}$. However, this can be seen as a trade-off -- a chance to obtain superior practical convergence at the cost of gradient bias.

\begin{remark}\label{rem:eg}
    We can transform the obtained estimation for the case of monotone stochastic operators \hyperref[ass:monotone]{2(b)} acting on a bounded domain of diameter $D$. To do this, we use a regularization trick with $\mu\sim \frac{\varepsilon}{D^2}$. Thus, solving the problem with the operator $\hat{F}(z) = F(z) + \mu(z-z^0)$ with the accuracy $\frac{\varepsilon}{2}$, we solve the problem \eqref{eq:vi_setting} with the accuracy $\varepsilon$ and obtain $\mathcal{\widetilde{O}}\left(\frac{Ln^{\nicefrac{3}{2}}}{\varepsilon} \right)$ iteration and oracle complexity. Note that this is convergence in argument, which differs from the classical form.
\end{remark}


\section{Experiments}\label{sec:experiments}

Our experimental evaluation spans a diverse set of tasks, illustrating the effectiveness of \textsc{ExtraPAGE} in various practical settings. The structure of this section is as follows:

\begin{itemize}
    \item \textbf{Analysis on Toy Bilinear Problems (Section \ref{sec:bilinear}):}
    We begin with an evaluation of \textsc{ExtraPAGE}'s performance on synthetic bilinear problems. These controlled experiments serve as a baseline for comparison with existing approaches.

    \item \textbf{Deep Learning Scalability (Section \ref{sec:resnet}):}
    We assess the scalability and adaptability of \textsc{ExtraPAGE} by training a ResNet-18 model on the CIFAR-10 image classification task.

    \item \textbf{Practical Utility in Denoising (Section \ref{sec:denoising}):}
    We then apply our method to image denoising --- a canonical application of saddle-point optimization.

    \item \textbf{Performance on GAN Training (Section \ref{sec:gan}):}
    To further validate the robustness and convergence properties of \textsc{ExtraPAGE}, we compare it against established baselines in the challenging setting of GAN training.
\end{itemize}

We compare \textsc{ExtraPAGE} (Algorithm~\ref{alg:egsarah}) with existing methods from the literature. As reference baselines, we take \textsc{Extragradient} \citep{juditsky2011solving}, \textsc{Extragradient with Variance Reduction} (\textsc{EGVR}) \citep{alacaoglu2022stochastic}, Stochastic Gradient Descent Ascent (\textsc{SGDA}) \citep{nemirovski2009robust} with and without clipping, and \textsc{SPIDER-GDA} \citep{chen2022faster}. Additional experiments, including adversarial training and extended formulations discussed above, are provided in Appendix~\ref{sec:additionalexp}.

% This section is structured as follows. First, we analyze \textsc{ExtraPAGE} performance on toy experiments involving bilinear problems (Section \ref{sec:bilinear}), providing a clear and intuitive demonstration of our method in controlled settings. Then we proceed with experiments on training ResNet-18 on the CIFAR-10 image classification (Section \ref{sec:resnet}), demonstrating the scalability and adaptability of our method to more complex deep learning tasks. Following this, we apply our method to another critical practical problem -- denoising, which is a widely used application of saddle-point methods, making it an excellent benchmark for evaluating practical utility (Section \ref{sec:denoising}). Next, we validate the performance of our method compared to other baselines on the GAN's training (Section \ref{sec:gan}). 
% Through these diverse experiments, we aim to provide a comprehensive evaluation that not only validates the theoretical strengths of ExtraPAGE but also highlights its real-world applicability and superiority over classical methods.
% Finally, we provide a brief overall discussion of the results obtained in Section \ref{sec:discussion}. Additionally, we highlight how \textsc{ExtraPAGE} significantly outperforms the classical \textsc{ExtraGradient} method in the challenging task of adversarial training in Appendix \ref{sec:adversarial}. 


   
\subsection{Bilinear Saddle Point Problem} \label{sec:bilinear}
We start our experiments with a distributed bilinear problem
\begin{equation} \label{problem_bilinear}
\min_{x\in \mathbb R^{d_x}} \max_{y\in \mathbb R^{d_y}} x^TAy + a^Tx + b^Ty + \frac{\lambda^2}{2}\|x\|^2 - \frac{\lambda^2}{2}\|y\|^2,
\end{equation}
where $A\in \mathbb R^{d\times d}$ and $a,b \in \mathbb R^d$ (i.e., $d_x = d_y = d$). This problem is $\lambda^2$-strongly-convex--strongly-concave and, moreover, it is $\|A\|_2$-smooth. Therefore, this distributed problem is well suited for the primary comparison of our methods. We take $d = 100$, and in order to apply stochastic methods, we generate a set of positive definite matrices $A_i$ and vectors $a_i, b_i$ randomly. We represent the matrix $A$ as the sum of matrices $A_i$, that is, $A = \sum \nolimits_{i =  1}^n A_i$, where $n = 100$; the same operation is performed for the vectors $a$ and $b$.

The experiments are carried out for matrices with the ratio of eigenvalues $\frac{L}{\mu} = 10^4$ and $\frac{L}{\mu} = 10^2$, where $L, \mu$ are the maximum and minimum eigenvalues of the matrix $A$, respectively. The results are presented in Figures~\ref{fig:loss_10_4} and~\ref{fig:loss_10_2}.

\begin{figure}[ht]
    \centering
    \begin{minipage}{0.23\textwidth}
        \centering
        \includegraphics[width=\linewidth]{new_plots/bilinear/logloss_10_4_final_together.pdf}
        \caption{\textsc{ExtraPAGE} compared to different baselines on the problem \ref{problem_bilinear} with $\frac{L}{\mu} = 10^4$.}
        \label{fig:loss_10_4}
    \end{minipage}
    \hfill
    \begin{minipage}{0.23\textwidth}
        \centering
        \includegraphics[width=\linewidth]{new_plots/bilinear/logloss_10_2_together.pdf}
        \caption{\textsc{ExtraPAGE} compared to different baselines on the problem \ref{problem_bilinear} with $\frac{L}{\mu} = 10^2$.}
        \label{fig:loss_10_2}
    \end{minipage}
\end{figure}
The empirical findings reveal that \textsc{ExtraPAGE} showcases enhanced convergence when contrasted with the aforementioned algorithms. Of particular note is the performance of our methodology at huge $\nicefrac{L}{\mu}$ ratios. Although the convergence rate of $\nicefrac{L^3}{\mu^3}$ was formerly derived by \citet{chen2022faster}, this bound fails to precisely depict the real-world behavior witnessed with substantial condition numbers. Despite the fast convergence of the algorithm under such conditions, a considerable gap persisted between the theoretical prediction and the observed empirical outcomes. This paper bridges this shortcoming by establishing a refined, more accurate convergence rate. This rate faithfully reflects the actual performance, specifically with such large condition numbers, which are common in real-world scenarios.


\subsection{Image Classification} \label{sec:resnet}

We investigate the performance of our method compared to the baselines on an image classification problem. We train the ResNet-18 model \citep{he2016deep} on the public CIFAR-10 \citep{krizhevsky2009learning} image dataset, using the methods considered in this paper as weight optimizers. To explore the robustness of the optimizers, we reformulate the standard minimization problem into the min-max optimization framework. Specifically, let 
$f(w, x, y)$  denote the loss function, where $w \in \mathbb{R}^{d_w}$ represents the model parameters, $x \in \mathbb{R}^{d_x}$ is the input, and  $y \in \mathbb{R}$ is the corresponding label. We consider the following optimization problem:
\begin{equation*}
    \min_{w \in \mathbb{R}^{d_w}} \max_{\|r\| \leqslant D}   \frac{1}{n} \sum \limits_{i = 1}^n f(w, x_i + r, y_i) + \frac{\lambda}{2}\|w\|^2 - \frac{\beta}{2}\|r\|^2,
\end{equation*}
where $f$ is the cross-entropy loss function, $r$ represents adversarial noise introduced to model data perturbations, and $\lambda, \beta$ are regularization parameters. 
The formulation can be expressed as a variational inequality:
\begin{equation*}
    z = \begin{pmatrix}
        w \\
        r
    \end{pmatrix}, \quad F_i(z) = \begin{pmatrix}
        \nabla_w f(w, x_i + r, y_i) + \lambda w \\
        -\nabla_r f(w, x_i + r, y_i) + \beta r
    \end{pmatrix}.
\end{equation*}
The results are presented in Figure \ref{fig:resnet}.

\begin{figure}[!ht]
    \centering \includegraphics[width=\columnwidth]{new_plots/cifar/plot_resnet_1pn.pdf}
    % \captionof{figure}{\textsc{ExtraPAGE} compared to \textsc{Extragradient} and \textsc{EGVR} on \texttt{CIFAR} dataset.}
    \captionof{figure}{\textsc{ExtraPAGE} compared to different baselines on \texttt{CIFAR} dataset. We choose $n = 100, p = \frac{1}{n}$.}
    \label{fig:resnet}
\end{figure}
\textsc{ExtraPAGE} exhibits stronger fluctuations in both accuracy and loss, yet this dynamic behavior enables it to achieve higher peak and average accuracy compared to other baselines. While the trajectory is more volatile, the algorithm consistently outperforms the alternatives, demonstrating its effectiveness for the applied image classification task. We also compare the running times for all methods in Table \ref{tab:runtime-100-1pn}.

\begin{table}[!ht]
    \caption{Runtime comparison of our algorithms with $n = 100, p = \frac{1}{n}$.}
    \label{tab:runtime-100-1pn}
    \centering
    \begin{tabular}{l S[table-format=3.3] S[table-format=2.3]@{~$\pm$~}S[table-format=1.3]}
        \toprule
        Algorithm & {Total Time} & \multicolumn{2}{c}{Round Time} \\
        \midrule
        EG           & 467.447 & 4.674 & 0.020 \\
        EGVR         & 618.560 & 6.186 & 0.009 \\
        SGDA         & 433.672 & 4.337 & 0.043 \\
        SGDA clipped & 438.911 & 4.389 & 0.252 \\
        SPIDER       & 841.750 & 8.417 & 0.761 \\
        ExtraPAGE    & 634.692 & 6.347 & 0.010 \\
        \bottomrule
    \end{tabular}
\end{table}
Despite being slower than the fastest baselines, \textsc{ExtraPAGE} demonstrates a reasonable epoch time while delivering superior performance in terms of accuracy, as observed in the corresponding learning curves. This suggests that its computational overhead is justified by improved convergence behavior.
In Appendix~\ref{app:resnet}, we further investigate the convergence and runtime of \textsc{ExtraPAGE} and the other baselines for different values of $n$ and $p$.

\subsection{Image denoising} \label{sec:denoising}

%%%%% DENOISING
\begin{figure*}[!ht]
    \centering
    \includegraphics[width=\textwidth]{new_plots/denoising/plot_denoising_b4.pdf}
    \captionof{figure}{\textsc{ExtraPAGE} and other baselines convergence on image with $\sigma = 0.1$ on the problem \ref{problem_denoising}}
    \label{fig:louvre}
\end{figure*}
%%%%%%

To formulate the image denoising problem \citep{chambolle2011first}, we consider the classic saddle point problem as we demonstrate in Example \ref{ex:convconcsaddle}:
$$
\min_{x \in \mathcal{X}} \max_{y \in \mathcal{Y}} \left[\langle Kx, y \rangle + G_1(x) - G_2(y)\right],
$$
where regularizers \(G_1\) and \(G_2\) are proper convex lower semicontinuous functions, and \(K\) is a continuous linear operator. To proceed with image denoising, we consider $g$ as a given noisy image and $u$ as the solution we seek. We use the Cartesian grid with the step $h: \{(i\cdot h, j\cdot h)\}$. Thus, specifically for the image denoising, we consider:
$$
\min_{u \in \mathcal{X}} \max_{p \in \mathcal{Y}} \left[\langle \nabla u, p \rangle_{\mathcal{Y}} + \nicefrac{\lambda}{2} \|u - g\|^2_2 - \delta_{P}(p)\right],
$$
where $p$ is a dual variable, \(\delta_{P}(p)\) is the indicator function of the set \(P\) defined as: $ P = \{ p \in \mathcal{Y} : \| p(x) \| \leq 1 \}.$ The indicator function \(\delta_P(p)\) is defined as zero if \(p\) belongs to the set \(P\), and infinity otherwise. We define the operator $\nabla u$ as the difference between neighboring pixels in the grid horizontally and vertically, normalizing by the step of the grid $h$. This formulation represents a saddle point problem, where we seek to minimize the first term with respect to \(u\) while simultaneously maximizing the second term with respect to \(p\). Using duality, we can write the final formulation of the considered problem as
\begin{equation} \label{problem_denoising}
\min_{u \in \mathcal{X}} \max_{p \in \mathcal{Y}} \left[-\langle u, \text{div} ~p \rangle_{\mathcal{X}} + \nicefrac{\lambda}{2} \|u - g\|^2_2 - \delta_{P}(p)\right].
\end{equation}
We divide images into batches -- equal squares. We consider two options: batches of size 4 and 8 according to the grid. Since the images are black and white, they are single-channel, which means that each batch is a square matrix with non-negative integers. It is also important to note that when calculating the gradient, the edges of the batch are processed according to the rule of adding a number equal to that of the nearest neighbor.

We select two images with different levels of additive zero-mean Gaussian noise: \(\sigma = 0.05\) and \(\sigma = 0.1\). Figure \ref{fig:louvre} provides a comparison of the proposed methods on the image with  \(\sigma = 0.1\). Additional results for all methods on another image are presented in Figure \ref{fig:girl} in Appendix \ref{app:denoising}. 

Comparing the images, it can be observed that \textsc{EGVR} demonstrates strong practical performance, with results that are nearly indistinguishable to the human eye from those of \textsc{ExtraPAGE}. 
The slight difference lies in the loss behavior. We notice that \textsc{EGVR} performs slightly better than our algorithm during the first epochs. Nonetheless, with continued training, the convergence of our algorithm surpasses that of \textsc{EGVR}. Besides, while both \textsc{ExtraPAGE} and \textsc{EGVR} converge well, \textsc{ExtraPAGE} shows a smoother and more stable decline in error. In contrast, other methods struggle significantly with the problem, failing to reduce noise effectively. This can be attributed to their inherent limitations in handling variance-reduced stochastic updates, which are crucial for image denoising. Finally, compared to the original noisy image, all tested methods achieved significant noise reduction.

\subsection{GAN Training} \label{sec:gan}

Generative Adversarial Networks (GANs) represent a powerful class of models widely applied in image generation tasks, with StyleGAN \citep{karras2019style} standing out for its ability to produce high-quality synthetic images. The adversarial nature of GAN training poses a $\min \max$ optimization problem, which can be effectively framed as a Variational Inequality. Thus, we explore the application of \textsc{ExtraPAGE} in training a StyleGAN model for style translation.

We utilize the \textsc{I'm something of a painter myself} dataset, which consists of two distinct domains: a set of $300$ Monet paintings and a set of $7028$ photographs. Each image is resized to \(256 \times 256\) pixels. We train the generator with an extrapolation step for both discriminators. We configure the training with a fixed learning rate $\gamma = 5 \times 10^{-5}$ and a batch size of $5$, consistent across both domains. The probability parameter $p = \frac{1}{n}$ is set based on the effective dataset size, though for computational efficiency, we adapt it to the mini-batch context. Training is conducted for multiple random initializations, specifically with random states $50$ and $57$. The results are presented in Figure \ref{fig:gan_results}.

\begin{figure}[H]
    \centering
    \begin{minipage}{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{new_plots/gan_exps/loss_G_plot_validation.pdf}
        \caption*{Generator Loss}
    \end{minipage}
    \hfill
    \begin{minipage}{0.23\textwidth}
        \centering
        \includegraphics[width=\textwidth]{new_plots/gan_exps/loss_DX_plot_validation.pdf}
        \caption*{Discriminator X Loss}
    \end{minipage}
    
    \vspace{0.2cm}
    
    \begin{minipage}{0.4\textwidth}
        \centering
        \includegraphics[width=\textwidth]{new_plots/gan_pics/image_generated.png}
        \caption*{Generated Images}
    \end{minipage}
    
    \caption{All components use \textsc{ExtraPAGE} with $\gamma = 5 \times 10^{-5}$ and batch size 5, random state $50$}
    \label{fig:gan_results}
\end{figure}

We provide additional results in Appendix \ref{app:gans}. As GANs represent one of the most prominent applications of VI algorithms in modern machine learning, \textsc{ExtraPAGE} proves its applicability to a wide range of tasks.

\section{Discussion} \label{sec:discussion}
In this paper, we present \textsc{ExtraPAGE}, a novel algorithm for solving variational inequalities (VIs) and saddle point problems (SPPs). Our method is built upon the variance-reduced algorithm \textsc{PAGE}; it leads to superior theoretical convergence properties compared to baselines and slightly outperforms them in practice. Additionally, our work closes an important gap in the theoretical understanding of \textsc{SARAH}-based methods applied to VIs and SPPs. Specifically, we derive a complexity bound with a linear dependence on the condition number of the problem under the assumption of Lipschitzness. Future research should refine theoretical bounds to establish the optimality of our method. What is more, further investigation into adaptive stepsize strategies could enhance the applicability of the method.


\section*{ACKNOWLEDGMENTS}

The work was done in the Laboratory of Federated Learning Problems of the ISP RAS (Supported by Grant App. No. 2 to Agreement No. 075-03-2024-214).
% \newpage

\bibliography{refs}
\bibliographystyle{plainnat}
\addcontentsline{toc}{section}{References}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\onecolumn
% \newpage
\appendix
% \section*{Contents}
% \newpage
\title{When Extragradient Meets PAGE: \\Bridging Two Giants to Boost Variational Inequalities \\ (Supplementary Material) \\}
\maketitle

\allowdisplaybreaks
\tableofcontents

% \newpage 
\section{Additional experiments}\label{sec:additionalexp}

In this section, we present additional experiments that have been performed as well as the technical details for them.

% \subsection{Adversarial Training}
\subsection{Adversarial Training} \label{sec:adversarial}

We address an adversarial training problem, formulated as in \eqref{problem_adversarial}.
% : \begin{equation} \label{problem_adversarial}
% \begin{aligned}
% \min_{w \in \mathbb{R}^d} & \max_{\|r_i\| \leqslant D}  
% \Bigg[
% \frac{1}{2N} \sum \nolimits_{i=1}^N 
% \left( w^T(x_i+r_i) - y_i \right)^2  \\
% &~~~~~~~~~~~~~~~~ \quad\quad\quad + \frac{\lambda}{2} \|w\|^2  
% - \frac{\beta}{2} \|r\|^2
% \Bigg],
% \end{aligned}
% \end{equation}

% where the samples corresponds to features $x_i$ and targets $y_i$. 
We evaluate the methods on this problem across several datasets: \texttt{mushrooms}, \texttt{a9a}, \texttt{w8a}, and \texttt{ijcnn1}, sourced from the \textsc{LIBSVM} library \citep{chang2011libsvm}. A brief description of these datasets is provided in Table~\ref{tab:dataset-summary}. The results are presented in Figure~\ref{fig:adversarial}.


\begin{center}
\centering
\begin{tabular}{ccc}
\includegraphics[width=\columnwidth]{new_plots/adversarial/plot_adversarial.pdf} 
\end{tabular}
% \captionof{figure}{\textsc{ExtraPAGE} compared to \textsc{Extragradient} and \textsc{EGVR} on \texttt{mushrooms}, \texttt{a9a}, \texttt{w8a}, and \texttt{ijcnn1} datasets on the problem \eqref{problem_adversarial}.}
\captionof{figure}{\textsc{ExtraPAGE} compared to different baselines on \texttt{mushrooms}, \texttt{a9a}, \texttt{w8a}, and \texttt{ijcnn1} datasets on the problem \eqref{problem_adversarial}.}
\label{fig:adversarial}
\end{center}

As shown in the plots, \textsc{ExtraPAGE} consistently outperforms other methods across all datasets (\texttt{mushrooms}, \texttt{a9a}, \texttt{w8a}, \texttt{ijcnn1}). These datasets vary in size and complexity, providing a comprehensive evaluation of our proposed algorithms in the context of adversarial training.

\begin{table}[!htbp]
    \centering
    \caption{Summary of Datasets}
    \label{tab:dataset-summary}
    \begin{tabularx}{\textwidth}{lXXX}
        \toprule
        \textbf{Name} & \begin{tabular}{@{}l@{}}
\textbf{Number of} \\
\textbf{Instances}
\end{tabular} & \begin{tabular}{@{}l@{}}
\textbf{Number of} \\
\textbf{Features}
\end{tabular} & \begin{tabular}{@{}l@{}}
\textbf{Number of} \\
\textbf{Classes}
\end{tabular} \\
        \midrule
        \texttt{mushrooms} & 8,124 & 112 & 2 \\
        \texttt{a9a} & 32,561 & 123 & 2 \\
        \texttt{w8a} & 49,749 & 300 & 2 \\
        \texttt{ijcnn1} & 49,990 & 22 & 2 \\
        \bottomrule
    \end{tabularx}
\end{table}

\subsection{Image denoising} \label{app:denoising}

In this section, we present additional experiments conducted on image denoising. Consistent with our previous findings, a notable pattern emerges: although \textsc{EGVR} initially converges faster, \textsc{ExtraPAGE} exhibits more stable convergence over time, ultimately reaching a more precise minimum. Both methods significantly outperform the vanilla \textsc{Extragradient} approach on this task. These results further reinforce the effectiveness of variance reduction techniques when applied to another image with noise level $\sigma=0.05$, underscoring their utility in solving denoising problems.

\begin{center}
\centering
\begin{tabular}{ccc}
\includegraphics[width=\textwidth]{new_plots/denoising/unified_results_2.pdf} 
\end{tabular}
\captionof{figure}{\textsc{ExtraPAGE} and other baselines convergence on image with $\sigma = 0.05$ on the problem \ref{problem_denoising} with batch sizes $\in \{4,8\}$}
\label{fig:girl}
\end{center}

\begin{center}
\centering
\begin{tabular}{ccc}
\includegraphics[width=\textwidth]{new_plots/denoising/plot_denoising_b8.pdf} 
\end{tabular}
\captionof{figure}{\textsc{ExtraPAGE} and other baselines convergence on image with $\sigma = 0.1$ on the problem \ref{problem_denoising}}
\label{fig:louvre2}
\end{center}

\subsection{Image Classification} \label{app:resnet}

This experiment was conducted on the \textsc{CIFAR-10} dataset \citep{krizhevsky2009learning}, widely used as a benchmark in the optimization community, consisting of 50,000 training and 10,000 test samples. Each sample is a $32 \times 32$ RGB image associated with one of ten class labels. The experiments were implemented in Python using the PyTorch library \citep{paszke2019pytorch}, leveraging both a single CPU (Intel Xeon 2.20 GHz) and a single GPU (NVIDIA Tesla P100) for computation. 
% To emulate a distributed environment, we split batches across multiple workers, simulating a decentralized optimization setting.

The experiments are conducted with the following setup:
\begin{itemize}
    \item learning rate $\gamma = 0.01$ for all optimizers;
    \item regularization parameters $\lambda = \beta = 0.0005$.
\end{itemize}

Below, we present a series of convergence plots and corresponding runtime tables that illustrate the performance of \textsc{ExtraPAGE} under different distributed settings. Each figure shows the optimization trajectory for \textsc{ExtraPAGE} compared to baseline methods when varying the number of workers $n$ and the update probability $p$. The tables summarize total training time and per-round timing statistics for each configuration, allowing a clear comparison of efficiency and stability across all tested scenarios.

\newpage

\begin{figure}[!ht]
    \centering \includegraphics[width=\columnwidth]{new_plots/cifar/plot_resnet_1p2n.pdf}
    \captionof{figure}{\textsc{ExtraPAGE} compared to different baselines on \texttt{CIFAR} dataset. We choose $n = 100, p = \frac{1}{2n}$.}
    \label{fig:resnet-1p2n}
\end{figure}

\begin{table}[!ht]
    \centering
    \begin{tabular}{l S[table-format=3.3] S[table-format=2.3]@{~$\pm$~}S[table-format=1.3]}
        \toprule
        Algorithm & {Total Time} & \multicolumn{2}{c}{Round Time} \\
        \midrule
        EG           & 467.447 & 4.674 & 0.020 \\
        EGVR         & 618.560 & 6.186 & 0.009 \\
        SGDA         & 433.672 & 4.337 & 0.043 \\
        SGDA clipped & 438.911 & 4.389 & 0.252 \\
        SPIDER       & 841.750 & 8.417 & 0.761 \\
        ExtraPAGE    & 633.366 & 6.334 & 0.009 \\
        \bottomrule
    \end{tabular}
    \caption{Runtime comparison of our algorithms with $n = 100, p = \frac{1}{2n}$.}
\end{table}

\newpage
\begin{figure}[!ht]
    \centering \includegraphics[width=\columnwidth]{new_plots/cifar/plot_resnet_2pn.pdf}
    \captionof{figure}{\textsc{ExtraPAGE} compared to different baselines on \texttt{CIFAR} dataset. We choose $n = 100, p = \frac{2}{n}$.}
    \label{fig:resnet-2pn}
\end{figure}

\begin{table}[!ht]
    \centering
    \begin{tabular}{l S[table-format=3.3] S[table-format=2.3]@{~$\pm$~}S[table-format=1.3]}
        \toprule
        Algorithm & {Total Time} & \multicolumn{2}{c}{Round Time} \\
        \midrule
        EG           & 467.447 & 4.674 & 0.020 \\
        EGVR         & 618.560 & 6.186 & 0.009 \\
        SGDA         & 433.672 & 4.337 & 0.043 \\
        SGDA clipped & 438.911 & 4.389 & 0.252 \\
        SPIDER       & 841.750 & 8.417 & 0.761 \\
        ExtraPAGE    & 634.688 & 6.347 & 0.011 \\
        \bottomrule
    \end{tabular}
    \caption{Runtime comparison of our algorithms with $n = 100, p = \frac{2}{n}$.}
\end{table}

\newpage
\begin{figure}[!ht]
    \centering \includegraphics[width=\columnwidth]{new_plots/cifar/plot_resnet_128_1pn.pdf}
    \captionof{figure}{\textsc{ExtraPAGE} compared to different baselines on \texttt{CIFAR} dataset. We choose $n = 390, p = \frac{1}{n}$.}
    \label{fig:resnet-128-1pn}
\end{figure}

\begin{table}[!ht]
    \centering
    \begin{tabular}{l S[table-format=3.3] S[table-format=2.3]@{~$\pm$~}S[table-format=1.3]}
        \toprule
        Algorithm & {Total Time} & \multicolumn{2}{c}{Round Time} \\
        \midrule
        EG           & 692.169 & 6.922 & 0.292 \\
        EGVR         & 1129.090 & 11.291 & 0.013 \\
        SGDA         & 526.992 & 5.270 & 1.162 \\
        SGDA clipped & 516.918 & 5.169 & 1.154 \\
        SPIDER       & 1133.076 & 11.331 & 4.043 \\
        ExtraPAGE    & 1178.660 & 11.787 & 0.028 \\
        \bottomrule
    \end{tabular}
    \caption{Runtime comparison of our algorithms with $n = 390, p = \frac{1}{n}$.}
\end{table}

\newpage
\begin{figure}[!ht]
    \centering \includegraphics[width=\columnwidth]{new_plots/cifar/plot_resnet_128_1p2n.pdf}
    \captionof{figure}{\textsc{ExtraPAGE} compared to different baselines on \texttt{CIFAR} dataset. We choose $n = 390, p = \frac{1}{2n}$.}
    \label{fig:resnet-128-1p2n}
\end{figure}

\begin{table}[!ht]
    \centering
    \begin{tabular}{l S[table-format=3.3] S[table-format=2.3]@{~$\pm$~}S[table-format=1.3]}
        \toprule
        Algorithm & {Total Time} & \multicolumn{2}{c}{Round Time} \\
        \midrule
        EG           & 692.169 & 6.922 & 0.292 \\
        EGVR         & 1129.090 & 11.291 & 0.013 \\
        SGDA         & 526.992 & 5.270 & 1.162 \\
        SGDA clipped & 516.918 & 5.169 & 1.154 \\
        SPIDER       & 1133.076 & 11.331 & 4.043 \\
        ExtraPAGE    & 1177.751 & 11.778 & 0.152 \\
        \bottomrule
    \end{tabular}
    \caption{Runtime comparison of our algorithms with $n = 390, p = \frac{1}{2n}$.}
\end{table}

\newpage
\begin{figure}[!ht]
    \centering \includegraphics[width=\columnwidth]{new_plots/cifar/plot_resnet_128_2pn.pdf}
    \captionof{figure}{\textsc{ExtraPAGE} compared to different baselines on \texttt{CIFAR} dataset. We choose $n = 390, p = \frac{2}{n}$.}
    \label{fig:resnet-128-2pn}
\end{figure}
\begin{table}[!ht]
    \centering
    \begin{tabular}{l S[table-format=3.3] S[table-format=2.3]@{~$\pm$~}S[table-format=1.3]}
        \toprule
        Algorithm & {Total Time} & \multicolumn{2}{c}{Round Time} \\
        \midrule
        EG          & 692.169 & 6.922 & 0.292 \\
        EGVR        & 1129.090 & 11.291 & 0.013 \\
        SGDA        & 526.992 & 5.270 & 1.162 \\
        SGDA clipped& 516.918 & 5.169 & 1.154 \\
        SPIDER      & 1133.076 & 11.331 & 4.043 \\
        ExtraPAGE   & 1173.841 & 11.738 & 0.023 \\
        \bottomrule
    \end{tabular}
    \caption{Runtime comparison of our algorithms with $n = 390, p = \frac{2}{n}$.}
\end{table}

Based on the conducted experiments, \textsc{ExtraPAGE} consistently demonstrates robust and stable convergence regardless of the number of workers $n$ or the update probability $p$. Through experiments with varying $n$, we confirm that \textsc{ExtraPAGE} outperforms existing benchmarks in ill-conditioned problems and high condition number scenarios relative to batch count, while spending comparable time for each iteration. Parameter analysis further shows minimal sensitivity to $p$, indicating that \textsc{ExtraPAGE} effectively balances computation and communication overhead without degradation in efficiency. Overall, these results highlight \textsc{ExtraPAGE}’s stability and insensitivity to both $n$ and $p$.

\subsection{GAN Training} \label{app:gans}
In this section, we provide additional experiments with random state $57$. The experiments were held on a single GPU (NVIDIA A100). The results are presented in Figure \ref{fig:gan_results_additional}.

\begin{figure}[H]
    \centering
    \begin{minipage}{0.3\textwidth}
        \centering
        \includegraphics[width=\textwidth]{new_plots/gan_exps/loss_G_plot_validation_old.pdf}
        \caption*{Generator Loss}
    \end{minipage}
    \hfill
    \begin{minipage}{0.3\textwidth}
        \centering
        \includegraphics[width=\textwidth]{new_plots/gan_exps/loss_DX_plot_validation_old.pdf}
        \caption*{Discriminator X Loss}
    \end{minipage}
    \hfill
    \begin{minipage}{0.3\textwidth}
        \centering
        \includegraphics[width=\textwidth]{new_plots/gan_exps/loss_DY_plot_validation_old.pdf}
        \caption*{Discriminator Y Loss}
    \end{minipage}

    
    \caption{All components use \textsc{ExtraPAGE} with $\gamma = 5 \times 10^{-5}$ and batch size 5, random state $57$}
    \label{fig:gan_results_additional}
\end{figure}



% \newpage
\section{General Inequalities}\label{sec:basicineq}
First, we mention important inequalities that are used in further proofs. Consider a function \( f \) satisfying \text{Assumption~\ref{ass:lip}}. Then for any positive integer \( n \), for all vectors \( x, y, x_1, \ldots, x_n \in \mathbb{R}^d \) and any positive scalar \( c \), the following inequalities hold.
\begin{align}
\notag\\ \hline \notag\\
\label{bi:young} \tag{Young} \left|\left\langle x, y \right\rangle\right| \quad & \leqslant \quad \frac{\|x\|^2}{2c} + \frac{c \|y\|^2}{2} \\
\notag\\ \hline \notag\\
\label{bi:norm} \tag{Norm}
\begin{split}
 -\left\langle x, y \right\rangle \quad &= \quad -\frac{\|x\|^2}{2} - \frac{\|y\|^2}{2} + \frac{\|x-y\|^2}{2} \\
 \|x + y\|^2 \quad &= \quad \|x\|^2 + \|y\|^2 + 2\left\langle x, y \right\rangle
\end{split}\\
\notag\\ \hline \notag\\
\label{bi:lsmoothness} \tag{Lip} 
\begin{split}
    \left\|\nabla f(x) - \nabla f(y)\right\|^2 \quad & \leqslant \quad L^2 \|x-y\|^2\\
    \quad f(x) \quad & \leqslant \quad f(y) + \left\langle \nabla f(y), x-y \right\rangle + \frac{L}{2} \|x-y\|^2\\
    f(x) \quad & \leqslant \quad f(y) - \left\langle \nabla f(x), y-x \right\rangle - \frac{1}{2L} \|\nabla f(x)-\nabla f(y)\|^2
\end{split}\\
\notag\\ \hline \notag\\
\label{bi:CauchySchwarz} \tag{CS}
\begin{split}
\left\|\sum_{i=1}^{n} x_i\right\|^2 \quad& \leqslant  \quad n \sum_{i=1}^{n} \|x_i\|^2  \\
\left\|x + y \right\|^2 \quad& \leqslant  \quad (1 + c)\|x\|^2 + \left(1 + \frac{1}{c}\right)\|y\|^2
\end{split}
\\
\notag\\ \hline \notag
% \label{bi:Jensen} \tag{Jen}  \varphi \left(\frac{\sum_{i=1}^{n} w_i x_i}{\sum_{i=1}^{n} w_i}\right) \quad & \leqslant \quad \frac{\sum_{i=1}^{n} w_i \varphi(x_i)}{\sum_{i=1}^{n} w_i} \\
% \notag\\ \hline \notag
% \\
% \label{bi:PL} \tag{PL}  g(x) - \inf g   \quad & \leqslant \quad \frac{1}{2\mu} \|\nabla g(x)\|^2 \\
% \notag\\ \hline \notag 
\end{align}



% \newpage
\section{Proof} \label{app:egsarah}
In this section, we provide all the necessary proofs. First, we prove Lemmas \ref{lemma}, \ref{lemma_zplus} and \ref{lemma_descent}. Then, we use them to establish the main Theorem \ref{theorem}, which guarantees the convergence of our algorithm. Finally, we derive Corollaries \ref{corollary_usual_convergance} and \ref{corollary}.

We begin with Lemma \ref{lemma}, which reflects the change in the quality of the gradient estimation from one iteration to the next. This lemma is crucial not only for the subsequent analysis but also has independent significance in developing intuition about variance reduction methods.

\textbf{Lemma \ref{lemma}.}
\textit{For iterations of Algorithm \ref{alg:egsarah} the following equation holds:}
\begin{align*}
\E_{i^t}\E_{G^{t-1}}\left\|F(z^{t+\nicefrac{1}{2}})-G^{t}\right\|^2 = (1-p)\left[\left\|F(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2 \right.&\left.+\E_{i^t}\left\|F_{i^t}\left(z^{t+\nicefrac{1}{2}}\right) - F_{i^t}\left(z^{t-\nicefrac{1}{2}}\right)\right\|^2 \right. \\
&\left.-\left\|F\left(z^{t+\nicefrac{1}{2}}\right)- F\left(z^{t-\nicefrac{1}{2}}\right)\right\|^2 \right].
\end{align*}
\begin{proof} 
We examine the following term using the update rule (Line \ref{line:update_rool}). We take the expectation over $G^{t-1}$:
\begin{eqnarray*}
    \E_{G^{t-1}}\left\|F(z^{t+\nicefrac{1}{2}})-G^{t}\right\|^2 &=& (1-p)\left\|F(z^{t+\nicefrac{1}{2}})-F_{i^t}(z^{t+\nicefrac{1}{2}}) + F_{i^t}(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2 \\
    &=& (1-p)\left\|F(z^{t+\nicefrac{1}{2}})-F_{i^t}(z^{t+\nicefrac{1}{2}}) + F_{i^t}(z^{t-\nicefrac{1}{2}}) - F(z^{t-\nicefrac{1}{2}}) + F(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2\\
    &=& (1-p)\left\|F(z^{t+\nicefrac{1}{2}})-F_{i^t}(z^{t+\nicefrac{1}{2}}) + F_{i^t}(z^{t-\nicefrac{1}{2}}) - F(z^{t-\nicefrac{1}{2}})\right\|^2 \\
    & & +~ (1-p) \left\|F(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2\\
    & & +~ 2(1-p)\left\langle F(z^{t+\nicefrac{1}{2}})-F_{i^t}(z^{t+\nicefrac{1}{2}}) + F_{i^t}(z^{t-\nicefrac{1}{2}}) - F(z^{t-\nicefrac{1}{2}}), F(z^{t-\nicefrac{1}{2}}) - G^{t-1} \right\rangle.
    \end{eqnarray*}  
Taking the expectation over $i^t$ and utilizing $\mathbb E_{i^t} \left[F_{i^t}(z)\right] = F(z)$, we derive that the scalar product is equal to zero. We continue the equation:
    \begin{eqnarray*}
    \E_{i^t}\E_{G^{t-1}}\left\|F(z^{t+\nicefrac{1}{2}})-G^{t}\right\|^2 &=& (1-p)\E_{i^t}\left\|F(z^{t+\nicefrac{1}{2}})-F_{i^t}(z^{t+\nicefrac{1}{2}}) + F_{i^t}(z^{t-\nicefrac{1}{2}}) - F(z^{t-\nicefrac{1}{2}})\right\|^2 \\ 
    && +~ (1-p)\left\|F(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2 \\
    &=& (1-p)\left\|F(z^{t+\nicefrac{1}{2}})- F(z^{t-\nicefrac{1}{2}})\right\|^2 + (1-p)\E_{i^t}\left\|F_{i^t}(z^{t+\nicefrac{1}{2}}) - F_{i^t}(z^{t-\nicefrac{1}{2}})\right\|^2 \\ 
    && -~2(1-p)\E_{i^t}\left\langle F(z^{t+\nicefrac{1}{2}})-F(z^{t-\nicefrac{1}{2}}), F_{i^t}(z^{t+\nicefrac{1}{2}}) - F_{i^t}(z^{t-\nicefrac{1}{2}})\right\rangle \\ 
    &&+~ (1-p)\left\|F(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2 \\
    &=& -(1-p)\left\|F(z^{t+\nicefrac{1}{2}})- F(z^{t-\nicefrac{1}{2}})\right\|^2 \\ 
    &&+~ (1-p)\E_{i^t}\left\|F_{i^t}(z^{t+\nicefrac{1}{2}}) - F_{i^t}(z^{t-\nicefrac{1}{2}})\right\|^2 + (1-p)\left\|F(z^{t-\nicefrac{1}{2}}) - G^{t-1}\right\|^2.
\end{eqnarray*}    
This finishes the proof of the lemma.
\end{proof}

We emphasize that Lemma \ref{lemma} is formulated as an equation without any assumptions on the operator $F$, making it an exact and general estimate. Now, we proceed to the proof of Lemma \ref{lemma_zplus} which is a general statement of Euclidean geometry and serves as the first step in deriving the recursion.

\begin{lemma}\label{lemma_zplus}
    \textit{Let $z,y \in \mathbb{R}^d, z^+ = z-y$. Then for all $u \in \mathbb{R}^d$ the following equation holds:}
    \begin{align*}
        \|z^+ - u\|^2 = \|z - u\|^2 - 2\langle y, z^+ - u \rangle - \|z^+ - z\|^2.
    \end{align*}
\end{lemma}
\begin{proof}
We transform the left part as follows:
\begin{eqnarray*}
        \|z^+ - u\|^2 &=&  \|z^+ -z + z - u\|^2 \\
        &=& \|z - u\|^2 + 2\langle z^+ - z, z - u \rangle + \|z^+ - z\|^2 \\
        &=& \|z - u\|^2 + 2\langle z^+ - z, z^+ - u \rangle - \|z^+ - z\|^2 \\
        &=& \|z - u\|^2 - 2\langle y, z^+ - u \rangle - \|z^+ - z\|^2.
    \end{eqnarray*}
This finishes the proof of the lemma.
\end{proof}

Lemma \ref{lemma_descent} serves as the final prerequisite for the main analysis in Theorem \ref{theorem} and relies on the assumption of the strong monotonicity of the operator $F$ to derive a recursion for the term $\left\|z^t-z^*\right\|^2$.

\begin{lemma}[Descent lemma]\label{lemma_descent}
    \textit{Under Assumption \hyperref[ass:str_monotone]{2(a)}, for every iteration $t$ of Algorithm \ref{alg:egsarah} the following inequality holds:}
    \begin{eqnarray*}
    \left\|z^{t+1} - z^*\right\|^2 &\leq& (1-\gamma \mu)\left\|z^t-z^*\right\|^2 - (1-2\gamma\mu)\bigl\|z^{t+\nicefrac{1}{2}}-z^t\bigr\|^2\\
    &&+~2\gamma\left\langle F(z^{t+\nicefrac{1}{2}})-G^t,z^{t+\nicefrac{1}{2}}-z^*\right\rangle + \gamma^2\left\|G^{t}-G^{t-1}\right\|^2.
\end{eqnarray*}
\end{lemma}
\begin{proof}
    We substitute $z = z^t, y = \gamma G^{t}, z^+ = z^{t+1}, u = z^{*}$, and $z = z^t, y = \gamma G^{t-1}, z^+ = z^{t+\nicefrac{1}{2}}, u = z^{t+1}$ into Lemma \ref{lemma_zplus} and sum the obtained equations. This yields
    \begin{eqnarray}
        \notag\left\|z^{t+1} -z^* \right\|^2 + \left\|z^{t+\nicefrac{1}{2}} -z^{t+1} \right\|^2 &=& \left\|z^{t} -z^* \right\|^2 - \left\|z^{t+\nicefrac{1}{2}} -z^{t} \right\|^2\\
        &&-~2\gamma\left\langle G^t,z^{t+1} - z^*\right\rangle - 2\gamma\left\langle G^{t-1},z^{t+\nicefrac{1}{2}} - z^{t+1}\right\rangle \notag\\
        &=& \left\|z^{t} -z^* \right\|^2 - \left\|z^{t+\nicefrac{1}{2}} -z^{t} \right\|^2 \notag\\
        \label{eq:lemma_descent_1}&&-~2\gamma\left\langle G^t,z^{t+\nicefrac{1}{2}} - z^*\right\rangle - 2\gamma\left\langle G^{t-1} - G^t,z^{t+\nicefrac{1}{2}} - z^{t+1}\right\rangle.
    \end{eqnarray}
    Now we examine the first scalar product:
    \begin{eqnarray*}
        -2\gamma\left\langle G^t,z^{t+\nicefrac{1}{2}} - z^*\right\rangle = 2\gamma\left\langle F(z^{t+\nicefrac{1}{2}})-G^t,z^{t+\nicefrac{1}{2}} - z^*\right\rangle - 2\gamma\left\langle F(z^{t+\nicefrac{1}{2}}),z^{t+\nicefrac{1}{2}} - z^*\right\rangle.
    \end{eqnarray*}
    Under the setting \eqref{eq:vi_setting} and the strong monotonicity (Assumption \hyperref[ass:str_monotone]{2(a)}), the last term transforms into
    \begin{eqnarray*}
        - 2\gamma\left\langle F(z^{t+\nicefrac{1}{2}}),z^{t+\nicefrac{1}{2}} - z^*\right\rangle
        &=& - 2\gamma\left\langle F(z^{t+\nicefrac{1}{2}}) - F(z^*),z^{t+\nicefrac{1}{2}} - z^*\right\rangle - 2\gamma\left\langle F(z^*), z^{t+\nicefrac{1}{2}} - z^* \right\rangle \\
        &\leq& - 2\gamma\mu \left\|z^{t+\nicefrac{1}{2}} - z^* \right\|^2 \leq -\gamma\mu \left\|z^{t} - z^* \right\|^2 + 2\gamma\mu \left\|z^{t+\nicefrac{1}{2}}- z^t \right\|^2,
    \end{eqnarray*}
    where the last inequality utilizes \eqref{bi:CauchySchwarz}. Finally, we obtain
    \begin{eqnarray}\label{eq:lemma_descent2}
        -2\gamma\left\langle G^t,z^{t+\nicefrac{1}{2}} - z^*\right\rangle \leq 2\gamma\left\langle F(z^{t+\nicefrac{1}{2}})-G^t,z^{t+\nicefrac{1}{2}} - z^*\right\rangle -\gamma\mu \left\|z^{t} - z^* \right\|^2 + 2\gamma\mu \left\|z^{t+\nicefrac{1}{2}}- z^t \right\|^2.
    \end{eqnarray}
    Now it is sufficient to note that according to Lines \ref{line:integer_point_updete} and \ref{line:half_integer_point_update},  $\gamma(G^{t-1}-G^t) = z^{t+1} - z^{t+\nicefrac{1}{2}}$, which transforms the second scalar product in \eqref{eq:lemma_descent_1} into
    \begin{eqnarray}\label{eq:lemma_descent3}
        - 2\gamma\left\langle G^{t-1} - G^t,z^{t+\nicefrac{1}{2}} - z^{t+1}\right\rangle = 2 \gamma^2\left\| G^t - G^{t-1} \right\|^2.
    \end{eqnarray}
    Substituting \eqref{eq:lemma_descent3} and \eqref{eq:lemma_descent2} into \eqref{eq:lemma_descent_1} finishes the proof of the lemma.
\end{proof}

Now, we are fully prepared to present the main analysis in Theorem \ref{theorem}. We begin with  Lemma \ref{lemma_descent}, deriving recursive relations for its terms, including those representing scalar products. Such terms can be negative and impose significant constraints.

\textbf{Theorem \ref{theorem}.}
\textit{Under Assumptions \ref{ass:lip}, \hyperref[ass:str_monotone]{2(a)}, after $T$ iterations of Algorithm \ref{alg:egsarah} with $\gamma\leqslant \frac{1}{30Ln^{\nicefrac{3}{2}}}, p=\frac{1}{n}$, the following inequality holds:}
    \begin{align*}
        \E\left[\left\|z^T - z^*\right\|^2 \right.&\left.+ \gamma^2\left\|G^{T-1}-G^{T-2}\right\|^2 + 2\gamma M\left\langle F(z^{T-\nicefrac{1}{2}})-G^{T-1},z^{T-\nicefrac{1}{2}}-z^*\right\rangle \right.\\
        &\left.+ \gamma^2H\left\|F(z^{T-\nicefrac{1}{2}})-G^{T-1}\right\|^2 \right] \leqslant \left(1 - \gamma\mu\right)^{T} \left\|z^0 - z^*\right\|^2,
    \end{align*}
    \textit{where} $M = \frac{1-p}{p-\gamma \mu}$ \textit{and} $H = 70n^3$.

\begin{proof}
We start with the result provided by Lemma \ref{lemma_descent}:
\begin{eqnarray*}
    \left\|z^{t+1} - z^*\right\|^2 &\leq& (1-\gamma \mu)\left\|z^t-z^*\right\|^2 - (1-2\gamma\mu)\bigl\|z^{t+\nicefrac{1}{2}}-z^t\bigr\|^2\\
    &&+2\gamma\left\langle F(z^{t+\nicefrac{1}{2}})-G^t,z^{t+\nicefrac{1}{2}}-z^*\right\rangle + 2\gamma^2\left\|G^{t}-G^{t-1}\right\|^2.
\end{eqnarray*}
Taking the expectation over $G^{t-1}$ and $i^t$ of both sides of the inequality, we obtain
\begin{eqnarray}
\label{eq:theorem_main:z}
    \E_{i^t}\E_{G^{t-1}}\left\|z^{t+1} - z^*\right\|^2 &\leq& (1-\gamma \mu)\left\|z^t-z^*\right\|^2 - (1-2\gamma\mu)\bigl\|z^{t+\nicefrac{1}{2}}-z^t\bigr\|^2\\
    &&\notag +2\gamma\E_{i^t}\E_{G^{t-1}}\left\langle F(z^{t+\nicefrac{1}{2}})-G^t,z^{t+\nicefrac{1}{2}}-z^*\right\rangle + 2\gamma^2\E_{i^t}\E_{G^{t-1}}\left\|G^{t}-G^{t-1}\right\|^2.
\end{eqnarray}
Let us now consider the terms obtained in \eqref{eq:theorem_main:z} separately. First,
\begin{eqnarray*}
\E_{i^t}\E_{G^{t-1}}\left\langle F(z^{t+\nicefrac{1}{2}})-G^t,z^{t+\nicefrac{1}{2}}-z^*\right\rangle &=& p\left\langle F(z^{t+\nicefrac{1}{2}})-F(z^{t+\nicefrac{1}{2}}),z^{t+\nicefrac{1}{2}}-z^*\right\rangle \\
&& +~ (1-p)\left\langle  F(z^{t+\nicefrac{1}{2}})-G^{t-1}, z^{t+\nicefrac{1}{2}}-z^* \right\rangle\\
&& +~ (1-p)\E_{i^t}\left\langle -F_{i^t}(z^{t+\nicefrac{1}{2}}) + F_{i^t}(z^{t-\nicefrac{1}{2}}),z^{t+\nicefrac{1}{2}}-z^*\right\rangle\\
&=& (1-p)\left\langle F(z^{t-\nicefrac{1}{2}})-G^{t-1},z^{t+\nicefrac{1}{2}}-z^*\right\rangle\\
&=& (1-p)\left\langle F(z^{t-\nicefrac{1}{2}})-G^{t-1},z^{t-\nicefrac{1}{2}}-z^*\right\rangle\\
&& +~ (1-p)\left\langle F(z^{t-\nicefrac{1}{2}})-G^{t-1},z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\rangle.
\end{eqnarray*}
Using inequality \eqref{bi:young}, we get
\begin{align}
\notag 2\gamma\E_{i^t}\E_{G^{t-1}}\left\langle F(z^{t+\nicefrac{1}{2}})-G^t,z^{t+\nicefrac{1}{2}}-z^*\right\rangle \leq& 2\gamma(1-p)\E\left\langle F(z^{t-\nicefrac{1}{2}})-G^{t-1},z^{t-\nicefrac{1}{2}}-z^*\right\rangle + \frac{1-p}{c}\left\|z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2 \\
\label{eq:theorem_main:S}& +~ c(1-p)\gamma^2\left\|F(z^{t-\nicefrac{1}{2}})-G^{t-1}\right\|^2,
\end{align}
where $c > 0$ is a constant to be specified later. Now we focus on the last term of \eqref{eq:theorem_main:S}. Lemma \ref{lemma} provides:
\begin{eqnarray*}
    \E_{i^t}\E_{G^{t-1}}\left\|F(z^{t+\nicefrac{1}{2}})-G^{t}\right\|^2 \leq (1-p)\E_{i^t}\left\|F_{i^t}(z^{t+\nicefrac{1}{2}})-F_{i^t}(z^{t-\nicefrac{1}{2}})\right\|^2 + (1-p)\left\|F(z^{t-\nicefrac{1}{2}})-G^{t-1}\right\|^2.
\end{eqnarray*}
Using Assumption \ref{ass:lip},
\begin{eqnarray} \label{eq:theorem_main:g}
    \E_{i^t}\E_{G^{t-1}}\left\|F(z^{t+\nicefrac{1}{2}})-G^{t}\right\|^2 & \leq&  (1-p)L^2\left\|z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2 + (1-p)\left\|F(z^{t-\nicefrac{1}{2}})-G^{t-1}\right\|^2. 
\end{eqnarray}
Then we reflect on the last term in \eqref{eq:theorem_main:z} and, making similar transformations, obtain
\begin{eqnarray}
   \E_{i^t}\E_{G^{t-1}}\left\|G^{t}-G^{t-1}\right\|^2 &=& (1-p)\E_{i^t}\left\| F_{i^t}(z^{t+\nicefrac{1}{2}})-F_{i^t}(z^{t-\nicefrac{1}{2}})\right\|^2 + p\left\| F(z^{t+\nicefrac{1}{2}})-G^{t-1}\right\|^2  \notag \\
   &\overset{\text{Ass. \ref{ass:lip}}}{\leq}& (1-p)L^2\left\| z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2 + p\left\| F(z^{t+\nicefrac{1}{2}}) -F(z^{t-\nicefrac{1}{2}}) + F(z^{t-\nicefrac{1}{2}}) -G^{t-1}\right\|^2  \notag \\
  &\overset{\eqref{bi:CauchySchwarz},\text{Ass. \ref{ass:lip}}}{\leq}& (1-p)L^2\left\| z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2 + 2pL^2\left\| z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2  \notag \\
    &&+~ 2p\E\left\| F(z^{t-\nicefrac{1}{2}}) -G^{t-1}\right\|^2  \notag \\
    &=& (1+p)L^2\left\| z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2 + 2p\left\| F(z^{t-\nicefrac{1}{2}})-G^{t-1}\right\|^2 \label{eq:theorem_main:g2}.
\end{eqnarray}
Note that the first term frequently appears in convergence analysis. Let us expand it in detail:
\begin{eqnarray*}
    \left\|z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2 &\overset{\eqref{bi:CauchySchwarz}}{\leq}& \left(1+\frac{1}{a}\right)\left\|z^{t+\nicefrac{1}{2}}-z^{t}\right\|^2 + \left(1+a\right)\left\|z^{t}-z^{t-\nicefrac{1}{2}}\right\|^2 \\
    &=& \left(1+\frac{1}{a}\right)\left\|z^{t+\nicefrac{1}{2}}-z^{t}\right\|^2 + \left(1+a\right)\gamma^2\left\|G^{t-1}-G^{t-2}\right\|^2.
\end{eqnarray*}
Taking expectations and applying \eqref{eq:theorem_main:g2},
\begin{eqnarray*}
\E_{i^{t-1}}\E_{G^{t-2}}\left\|z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2&\leq&
\left(1+\frac{1}{a}\right)\E_{i^{t-1}}\E_{G^{t-2}}\left\|z^{t+\nicefrac{1}{2}}-z^{t}\right\|^2 + 2p\left(1+a\right)\gamma^2 \left\| F(z^{t-\nicefrac{3}{2}})-G^{t-2}\right\|^2\\
&& +~(1+p)\left(1+a\right)L^2\gamma^2\left\| z^{t-\nicefrac{1}{2}}-z^{t-\nicefrac{3}{2}}\right\|^2.
\end{eqnarray*}
We now enter the recursion, considering that
\begin{eqnarray*}
\|z^{\nicefrac{1}{2}} - z^{-\nicefrac{1}{2}} \|^2 \leq \left(1+\frac{1}{a}\right) \|z^{\nicefrac{1}{2}} - z^0 \|^2 + (1+a)\|z^0 - z^{-\nicefrac{1}{2}} \|^2.
\end{eqnarray*}
Putting $z^{-1} = z^{-\nicefrac{1}{2}} = z^0$, as well as $G^{-1} = F(z^{-\nicefrac{1}{2}})$, we derive the estimate:
\begin{eqnarray*}
    & &\E_{i^{0}}\E_{G^{-1}}\ldots\E_{i^{t-1}}\E_{G^{t-2}}\left\|z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2 \\
    &&\leq
    \left(1+\frac{1}{a}\right)\sum\limits_{k=0}^t \left((1+p)\left(1+a\right)L^2\gamma^2\right)^{t-k} \E_{i^{k}}\E_{G^{k-1}}\ldots\E_{i^{t-1}}\E_{G^{t-2}}\left\|z^{k+\nicefrac{1}{2}}-z^{k}\right\|^2\\
    && \quad+ 2p\left(1+a\right)\gamma^2 \sum\limits_{k=0}^t \left((1+p)\left(1+a\right)L^2\gamma^2\right)^{t-k} \E_{i^{k}}\E_{G^{k-1}}\ldots\E_{i^{t-1}}\E_{G^{t-2}}\left\| F(z^{k-\nicefrac{1}{2}})-G^{k-1}\right\|^2.
\end{eqnarray*}
If we choose $\gamma \leq \frac{1}{L\sqrt{2(1+p)(1+a)}}$, we get that
\begin{eqnarray}
    \notag&&\E_{i^{0}}\E_{G^{-1}}\ldots\E_{i^{t-1}}\E_{G^{t-2}}\left\|z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2 \\
    \notag&&\leq
    \left(1+\frac{1}{a}\right)\sum\limits_{k=0}^t \left(\frac{1}{2}\right)^{t-k} \E_{i^{k}}\E_{G^{k-1}}\ldots\E_{i^{t-1}}\E_{G^{t-2}}\left\|z^{k+\nicefrac{1}{2}}-z^{k}\right\|^2\\
    \label{eq:theorem_main:z2}&& \quad+ 2p\left(1+a\right)\gamma^2 \sum\limits_{k=0}^t \left(\frac{1}{2}\right)^{t-k} \E_{i^{k}}\E_{G^{k-1}}\ldots\E_{i^{t-1}}\E_{G^{t-2}}\left\| F(z^{k-\nicefrac{1}{2}})-G^{k-1}\right\|^2.
\end{eqnarray}
For the sake of clarity, let us redefine the terms important for analysis:
\begin{eqnarray*}
\begin{cases}
    \delta^t ~~=& \left\|z^{t} - z^*\right\|^2 ,\\
    S^t ~=& \left\langle F(z^{t-\nicefrac{1}{2}})-G^{t-1},z^{t-\nicefrac{1}{2}}-z^*\right\rangle ,\\
    g^t ~~=& \left\|F(z^{t-\nicefrac{1}{2}})-G^{t-1}\right\|^2.
\end{cases}
\end{eqnarray*}
We sum \eqref{eq:theorem_main:z}, \eqref{eq:theorem_main:S} and $2\gamma^2\cdot$\eqref{eq:theorem_main:g2}, and take additional expectations. We get
\begin{eqnarray}\label{eq:theorem_main:delta}
    \notag&&\E_{i^{0}}\E_{G^{-1}}\ldots\E_{i^{t}}\E_{G^{t-1}}\delta^{t+1} + 2\gamma^2\E_{i^{0}}\E_{G^{-1}}\ldots\E_{i^{t}}\E_{G^{t-1}}\left\|G^{t}-G^{t-1}\right\|^2 \\
    \notag&&\leq \E_{i^{0}}\E_{G^{-1}}\ldots\E_{i^{t-1}}\E_{G^{t-2}}\left[ (1-\gamma \mu)\delta^t - (1-2\gamma\mu)\left\|z^{t+\nicefrac{1}{2}}-z^t\right\|^2 +(1-p)2\gamma S^t \right.\\
    && \left.\quad+ \left(\frac{1-p}{c} + 2(1+p)L^2\gamma^2\right)\left\|z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\right\|^2 + \left(c(1-p) + 4p\right)\gamma^2 g^t\right].
\end{eqnarray}


We proceed to analyze the convergence of the sum $V^{t} = \E \biggl[\delta^{t} + 2\gamma M S^{t} + \gamma^2Hg^{t} + 2\gamma^2\left\|G^{t}-G^{t-1}\right\|^2\biggr]$. At this stage, $\eqref{eq:theorem_main:delta} + M\cdot\eqref{eq:theorem_main:S} + \gamma^2H\cdot\eqref{eq:theorem_main:g}$ reveals:
\begin{eqnarray*}
   &&\E_{i^{0}}\E_{G^{-1}}\ldots\E_{i^{t-1}}\E_{G^{t-2}}\biggl[\delta^{t+1} + 2\gamma M S^{t+1} + \gamma^2Hg^{t+1} + 2\gamma^2\left\|G^{t+1}-G^{t}\right\|^2\biggr] \\
   &&\leq \E_{i^{0}}\E_{G^{-1}}\ldots\E_{i^{t-1}}\E_{G^{t-2}}\Biggl[(1-\gamma \mu) \delta^t - (1-2\gamma\mu)\E\bigl\|z^{t+\nicefrac{1}{2}}-z^t\bigr\|^2 +(1-p)(M+1)2\gamma S^t \\
    && \quad+\left(\frac{(1-p)(M+1)}{c} + 2(1+p)L^2\gamma^2 + (1-p)L^2\gamma^2 H \right)\E\bigl\|z^{t+\nicefrac{1}{2}}-z^{t-\nicefrac{1}{2}}\bigr\|^2 \\
    && \quad+ \left((1-p)(H+cM) + c(1-p) + 4p\right)\gamma^2 g^t\Biggr].
\end{eqnarray*}
Using \eqref{eq:theorem_main:z2} and taking the full expectation, 
\begin{eqnarray*}
   V^{t+1} &\leq& (1-\gamma \mu) \E\delta^t - (1-2\gamma\mu)\E\bigl\|z^{t+\nicefrac{1}{2}}-z^t\bigr\|^2 +(1-p)(M+1)2\gamma \E S^t\\
    && +~ \left(\frac{(1-p)(M+1)}{c} + 2(1+p)L^2\gamma^2 + (1-p)L^2\gamma^2 H \right) \\ &&~~~ \cdot\Biggl[\left(1+\frac{1}{a}\right)\sum\limits_{k=0}^t \left(\frac{1}{2}\right)^{t-k} \E\left\|z^{k+\nicefrac{1}{2}}-z^{k}\right\|^2\\
    &&\quad~~+~ 2p\left(1+a\right)\gamma^2 \sum\limits_{k=0}^t \left(\frac{1}{2}\right)^{t-k} \E g^k\Biggr] \\
    &&+~ \left((1-p)(H+cM) + c(1-p) + 4p\right)\gamma^2 \E g^t.
\end{eqnarray*}
Now we sum this over all iterations with positive coefficients $q^t$:
\begin{eqnarray} \label{eq:theorem_main:main}
    \sum\limits_{t=0}^{T-1} q^t  V^{t+1} &\leq& (1-\gamma \mu)  \sum\limits_{t=0}^{T-1} q^t \E \delta^t - (1-2\gamma\mu) \sum\limits_{t=0}^{T-1} q^t \E\bigl\|z^{t+\nicefrac{1}{2}}-z^t\bigr\|^2 \\
    && +~ (1-p)(M+1)2\gamma  \sum\limits_{t=0}^{T-1} q^t \E S^t \notag \\
    && +~ \left(\frac{(1-p)(M+1)}{c} + 2(1+p)L^2\gamma^2 + (1-p)L^2\gamma^2 H \right) \notag \\ 
    &&~~~ \cdot\Biggl[\left(1+\frac{1}{a}\right) \sum\limits_{t=0}^{T-1} q^t \sum\limits_{k=0}^t \left(\frac{1}{2}\right)^{t-k} \E\left\|z^{k+\nicefrac{1}{2}}-z^{k}\right\|^2 \notag \\
    &&\quad~~+~ 2p\left(1+a\right)\gamma^2 \sum\limits_{t=0}^{T-1} q^t \sum\limits_{k=0}^t \left(\frac{1}{2}\right)^{t-k} \E g^k\Biggr] \notag\\
    &&+~ \left((1-p)(H+cM) + c(1-p) + 4p\right)\gamma^2  \sum\limits_{t=0}^{T-1} q^t \E g^t \notag.
\end{eqnarray}
At this stage, our task is to choose the constants $q^t, M, H$ such that the factors of identical terms in \eqref{eq:theorem_main:main} cancel out. We commence with choosing $q^t = (1-\gamma \mu)^{-t}$. 

\begin{itemize}
    \item  We proceed by taking a look at $S^t$. Our objective is to cancel the terms involving $S^t$, $t \in \{1,2,\ldots,T-1\}$, by equating their coefficients on the RHS and LHS:
\begin{eqnarray}\label{eq:theorem_main:M_final}
    2\gamma M q^{t-1} &=& (1-p)(M+1) 2\gamma q^t, \notag \\
    M &=& \frac{1-p}{p-\gamma \mu}.
\end{eqnarray}
For clarity, we define 
\begin{eqnarray}\label{eq:theorem_main:D}
    D &=& \frac{(1-p)(M+1)}{c} + 2(1+p)L^2\gamma^2 + (1-p)L^2\gamma^2H.
\end{eqnarray}
\item Then we consider the coefficient at $g^t$.
\begin{align*}
    &LHS:\gamma^2 H  \sum\limits_{t=0}^{T-1} q^t g^{t+1} .\\
    &RHS:2p\left(1+a\right)D\gamma^2 \sum\limits_{t=0}^{T-1} q^t \sum\limits_{k=0}^t \left(\frac{1}{2}\right)^{t-k} g^k 
    + \left((1-p)(H+cM) + c(1-p) + 4p\right)\gamma^2  \sum\limits_{t=0}^{T-1} q^t g^t .
\end{align*}
Comparing the coefficients of $g^t$, we require
\begin{eqnarray*}
    q^t \gamma^2 H &\geqslant& q^{t+1} \gamma^2\left((1-p)c+4p + (1-p)(H+cM)\right) + 2p(1+a)D\gamma^2 \underbrace{\left(\sum\limits_{k=t}^{T-1}\left(\frac{1}{2}\right)^{k-t}q^k\right)}_{q^t\left(\sum\limits_{k=0}^{T-1-t}\left(\frac{1}{2}\right)^{k}q^k\right)}.
\end{eqnarray*}
After that, we can divide the left and right sides by $ q^t \gamma^2 $:
\begin{eqnarray*}
    H &\geqslant& q \left((1-p)c+4p + (1-p)(H+cM)\right) + 2p(1+a)D \cdot\frac{1}{1-\frac{q}{2}}.
\end{eqnarray*}
Choosing $q \leq \frac{4}{3}$, we obtain $\frac{1}{1-\frac{q}{2}} \leq 3$.
In that way, recalling the definition of $D$ \eqref{eq:theorem_main:D}, we want
\begin{eqnarray}\label{eq:teorem_main:Hgeq}
    H\left(1-q(1-p)-6p(1+a)(1-p)L^2\gamma^2\right) &\geqslant& q\left(4p + \left(1-p\right)\left(M+1\right)c\right) \\
    &&+~ 6p(1+a)\left(\frac{(1-p)(1+M)}{c} + 2(1+p)L^2\gamma^2\right). \notag
\end{eqnarray}
This enables us to omit the terms containing $g^t$, $t \in \{1,2,\ldots,T-1\}$, in inequality \eqref{eq:theorem_main:main}.

\item Let us lastly dissect the coefficient of $\|z^{t+\nicefrac{1}{2}} - z^t\|^2$ to eliminate these terms from \eqref{eq:theorem_main:main}. Here we use ${\left(\sum\limits_{k=t}^{T-1}\left(\frac{1}{2}\right)^{k-t}q^k\right)} = {q^t\left(\sum\limits_{k=0}^{T-1-t}\left(\frac{1}{2}\right)^{k}q^k\right)} \leq 3q^t$ again:
\begin{eqnarray*}
    D\left(1 + \frac{1}{a}\right)\sum^{T-1}_{k = t} q^k \left(\frac{1}{2}\right)^{k-t} - q^t(1-2\gamma\mu) \leq 0 ,
\end{eqnarray*}
\begin{eqnarray*}
    3D\left(1 + \frac{1}{a}\right) \leq 1-2\gamma\mu .
\end{eqnarray*}
Substituting $D$ from the definition \eqref{eq:theorem_main:D} we obtain:
\begin{eqnarray*}
    3\left(\frac{(1-p)(M+1)}{c} + 2(1+p)L^2\gamma^2 + (1-p)L^2\gamma^2H\right)\left(1+\frac{1}{a}\right) &\leq& 1-2\gamma\mu.
\end{eqnarray*}
This imposes a further restriction on $H$:
\begin{eqnarray}\label{eq:theorem_main:Hleq}
    3H(1-p)L^2\gamma^2\left(1+\frac{1}{a}\right) \leq 1 - 2\gamma\mu - 3\left(\frac{(1-p)(M+1)}{c} + 2(1+p)L^2\gamma^2\right)\left(1+\frac{1}{a}\right).
\end{eqnarray}
\end{itemize}

For a suitable $H$ satisfying both \eqref{eq:theorem_main:Hleq} and \eqref{eq:teorem_main:Hgeq} to exist, the following inequality should hold:
\begin{align}\label{eq:theorem_main:Hmain}
    &\frac{1}{1-q(1-p)-6p(1+a)(1-p)L^2\gamma^2}\left[q\left(4p+(1-p)(M+1)c\right) + 6p(1+a)\left(\frac{(1-p)(1+M)}{c} + 2(1+p)L^2\gamma^2\right)\right] \notag \\
    &\leq \frac{1}{3(1-p)\left(1 + \frac{1}{a}\right)L^2\gamma^2}\left[1-2\gamma\mu-3\left(\frac{(1-p)(1+M)}{c}+2(1+p)L^2\gamma^2\right)\left(1+\frac{1}{a}\right)\right]. 
\end{align}
We evaluate \eqref{eq:theorem_main:Hmain} assuming $a = 1, \gamma = \frac{1}{bLn^{\nicefrac{3}{2}}}, b\geqslant 1, n\geqslant 4, p = \frac{1}{n}$. First of all, we recall \eqref{eq:theorem_main:M_final} and derive a useful upper bound for $M+1$:
\begin{align}\label{eq:M_upper_bound}
    M+1 = \frac{1-\gamma\mu}{p-\gamma\mu} \leq \frac{1}{p-p\frac{\mu}{Lb\sqrt{n}}} \leq \frac{2}{p} = 2n.
\end{align}
Now our strategy is to obtain the upper bound for the LHS and lower for the RHS of \eqref{eq:theorem_main:Hmain}. Let us start with the RHS:
\begin{eqnarray*}
    RHS \geqslant \frac{b^2n^3}{6}\left[1 - \frac{2\mu}{bn^{\nicefrac{3}{2}}L}-6\left(\frac{2n}{c} + \frac{4}{b^2n^3}\right)\right] .
\end{eqnarray*}
Here we encounter the necessity of setting $c > 12n$. For further analysis we will use $c = 24n$:
\begin{eqnarray}\label{eq:theorem_main:RHS}
    RHS \geqslant \frac{b^2n^3}{6}\left[\frac{1}{2} - \frac{2}{bn^{\nicefrac{3}{2}}}-\frac{24}{b^2n^3}\right] \geqslant \frac{b^2n^3}{6}\left[\frac{1}{2} - \frac{1}{2b}-\frac{3}{8b^2}\right].
\end{eqnarray}
Then, we start evaluation of the LHS, using our choice of $q, \gamma, c, a$ and derived upper bound for $M+1$:
\begin{eqnarray}\label{eq:theorem_main:LHS}
    LHS &\leq& \frac{1-\gamma\mu}{(1-\gamma\mu) - (1-p) - (1-\gamma\mu)2p(1-p)\frac{1}{b^2n^3}}\\
    \notag &&\cdot\left[\frac{4}{3}\left(4p+(1-p)c\frac{2}{p}\right) 
    + 12p\left(\frac{(1-p)}{c}\frac{2}{p} + 2(1+p)\frac{1}{b^2 n^3}\right)\right]  \notag \\
    &\leq& \frac{1}{p\left(1 - \frac{1}{b\sqrt{n}} - \frac{12}{b^2 n^2}\right)}\left[\frac{4}{3}\left(4p+48\frac{1}{p^2}\right)+p+\frac{48p^4}{b^2}\right] \notag \\
    &\leq& \frac{n^3}{1 - \frac{1}{2b} - \frac{3}{4b^2}}\left[64 +\frac{1}{12} +\frac{1}{64} + \frac{3}{256b^2} \right].
\end{eqnarray}
Combining \eqref{eq:theorem_main:RHS} and \eqref{eq:theorem_main:LHS} we reach a sufficient condition for the existence of a solution to inequality \eqref{eq:theorem_main:Hmain}:
\begin{eqnarray*}
    \frac{1}{1 - \frac{1}{2b} - \frac{3}{4b^2}}\left[64 +\frac{1}{12} +\frac{1}{64} + \frac{3}{256b^2} \right] \leq \frac{b^2}{6}\left[\frac{1}{2} - \frac{1}{2b}-\frac{3}{8b^2}\right].
\end{eqnarray*}
Straightforward evaluation shows that $b \geqslant 28.6$ is sufficient. This means that, under the assumption $b \geqslant 30$, the lower bound on $H$ is less than the upper one. To select an appropriate value for $H$ with a compact notation, we observe that $b = 30$ implies:
\begin{eqnarray*}
    &&RHS \geqslant \frac{n^3b^2}{6}\left[\frac{1}{2} - \frac{1}{2b}-\frac{3}{8b^2}\right] = \frac{1159}{16}n^{3} \geqslant 72n^{3},\\
    &&LHS \leq \frac{n^3}{1 - \frac{1}{2b} - \frac{3}{4b^2}}\left[64 +\frac{1}{12} +\frac{1}{64} + \frac{3}{256b^2} \right] = \frac{4922801}{75456}n^{3} \leq 66n^{3}.
\end{eqnarray*}

Moreover, an increase in $b$ only widens these boundaries. Therefore, for all $\gamma \leq \frac{1}{30Ln^{\nicefrac{3}{2}}}$ we can set $H = 70n^3$. Finally, we note that $\gamma \leq \frac{1}{30Ln^{\nicefrac{3}{2}}}$ satisfies all the previous constraints. 
After these preparations we can derive the necessary result from \eqref{eq:theorem_main:main}:
\begin{eqnarray*}
        &&\E\left[\left\|z^T - z^*\right\|^2 + \gamma^2\left\|G^{T-1}-G^{T-2}\right\|^2 + 2\gamma M\left\langle F(z^{T-\nicefrac{1}{2}})-G^{T-1},z^{T-\nicefrac{1}{2}}-z^*\right\rangle \right.\\
        & & \left.\quad+ \gamma^2H\left\|F(z^{T-\nicefrac{1}{2}})-G^{T-1}\right\|^2 \right] \\
        &&\leq \left(1 - \gamma\mu\right)^{T}\left[\left\|z^0 - z^*\right\|^2 + 2\gamma M\left\langle F(z^{-\nicefrac{1}{2}})-G^{-1},z^{-\nicefrac{1}{2}}-z^*\right\rangle + \gamma^2H\left\|F(z^{-\nicefrac{1}{2}})-G^{-1}\right\|^2\right]
        = \left(1 - \gamma\mu\right)^{T}\left\|z^0 - z^*\right\|^2.
    \end{eqnarray*}
This finishes the proof of the theorem.
\end{proof}

Corollary \ref{corollary_usual_convergance} highlights the advantage of the obtained guarantees based on the function $V^t$ over the conventional criterion $\left\|z^t - z^*\right\|^2$. This superiority is not immediately apparent due to the potential negativity of the scalar product $\left\langle F(z^{t-\nicefrac{1}{2}})-G^{t-1},z^{t-\nicefrac{1}{2}}-z^*\right\rangle$.

\textbf{Corollary \ref{corollary_usual_convergance}.}
\textit{In settings of Theorem \ref{theorem}, after $T$ iterations of Algorithm \ref{alg:egsarah} with $\gamma\leqslant \frac{1}{30Ln^{\nicefrac{3}{2}}}$ and $ p=\frac{1}{n}$, the following inequality holds:}
    \begin{eqnarray*}
        \E\left[\frac{1}{2}\left\|z^T - z^*\right\|^2 
     + \frac{\gamma^2 H}{2}\left\|F(z^{T-\nicefrac{1}{2}})-G^{T-1}\right\|^2\right] \leq \left(1 - \gamma\mu\right)^{T} \left\|z^0 - z^*\right\|^2 .
    \end{eqnarray*}

    
\begin{proof}
    First of all, due to the \eqref{bi:young} inequality, we get
    \begin{eqnarray}\label{eq:cor2_S}
        &&2\gamma M\left\langle F(z^{T-\nicefrac{1}{2}})-G^{T-1},z^{T-\nicefrac{1}{2}}-z^*\right\rangle \geqslant -\frac{\gamma^2H}{2}\left\|F(z^{T-\nicefrac{1}{2}})-G^{T-1}\right\|^2 - \frac{2M^2}{H}\left\|z^{T-\nicefrac{1}{2}}-z^*\right\|^2.
    \end{eqnarray}
    Then, we apply \eqref{bi:CauchySchwarz} to derive
    \begin{eqnarray}\label{eq:cor2_z}
        -\left\|z^{T-\nicefrac{1}{2}}-z^*\right\|^2 \geqslant -2\left\|z^{T-\nicefrac{1}{2}}-z^T\right\|^2 -2\left\|z^{T}-z^*\right\|^2 .
    \end{eqnarray}
    Recalling estimate \eqref{eq:M_upper_bound} we conclude $M \leq 2n$. Combining it with \eqref{eq:cor2_S}, \eqref{eq:cor2_z} and chosen $H = 70n^3$, so that $\frac{4M^2}{H} \leq \frac{16n^2}{70n^3} = \frac{8}{35n}$, we obtain
    \begin{eqnarray}
        \notag 2\gamma M\left\langle F(z^{T-\nicefrac{1}{2}})-G^{T-1},z^{T-\nicefrac{1}{2}}-z^*\right\rangle &\geqslant& -\frac{\gamma^2H}{2}\left\|F(z^{T-\nicefrac{1}{2}})-G^{T-1}\right\|^2 - \frac{8}{35n}\left\|z^{T}-z^*\right\|^2 \\
        \label{eq:cor2_Smain}& & - \frac{8}{35n}\left\|z^T-z^{T-\nicefrac{1}{2}}\right\|^2.
    \end{eqnarray}
    After that, we examine $\left\|G^{T-1}-G^{T-2}\right\|^2$ using Lines \ref{line:half_integer_point_update} and \ref{line:integer_point_updete} of Algorithm \ref{alg:egsarah}:
    \begin{eqnarray}\label{eq:cor2_gz}
        \gamma^2\left\|G^{T-1}-G^{T-2}\right\|^2 = \left\|(z^{T-1} - z^{T})-(z^{T-1} - z^{T-\nicefrac{1}{2}})\right\|^2 = \left\|z^{T}-z^{T-\nicefrac{1}{2}}\right\|^2.
    \end{eqnarray}
    Finally, we plug \eqref{eq:cor2_gz} and \eqref{eq:cor2_Smain} into the Lyapunov function $V^t$ in the result of Theorem \ref{theorem}:
    \begin{eqnarray*}
    \left(1 - \gamma\mu\right)^{T} \left\|z^0 - z^*\right\|^2 
    &\geqslant& \E\Big[\left\|z^T - z^*\right\|^2 + \gamma ^2 \left\|G^{T-1}-G^{T-2}\right\|^2 + 2\gamma M\left\langle F(z^{T-\nicefrac{1}{2}})-G^{T-1},z^{T-\nicefrac{1}{2}}-z^*\right\rangle\\
     &&~~~~~+ \gamma^2 H\left\|F(z^{T-\nicefrac{1}{2}})-G^{T-1}\right\|^2 \Big] \\
     &\geqslant& \E\left[\left(1-\frac{8}{35n}\right)\left(\left\|z^T - z^*\right\|^2 + \gamma^2\left\|G^{T-1}-G^{T-2}\right\|^2 \right) 
     + \frac{\gamma^2 H}{2}\left\|F(z^{T-\nicefrac{1}{2}})-G^{T-1}\right\|^2\right].
    \end{eqnarray*}
    Since $1-\frac{8}{35n} \geqslant \frac{1}{2}$ for all $n \geqslant 1$ and the term $\gamma^2\left\|G^{T-1}-G^{T-2}\right\|^2$ is non-negative, this finishes the proof of the corollary.
\end{proof}

Now we are ready to present the final convergence estimate for our method.

\textbf{Corollary \ref{corollary}.}
    \textit{Suppose Assumptions \ref{ass:lip}, \hyperref[ass:str_monotone]{2(a)} hold. Then Algorithm \ref{alg:egsarah} with $\gamma = \frac{1}{30Ln^{\nicefrac{3}{2}}}$ and $p = \frac{1}{n}$, to reach $\varepsilon$-accuracy, where $\varepsilon \sim V^T$, needs}
    \begin{equation*}
    \mathcal{\widetilde{O}}\left(\frac{Ln^{\nicefrac{3}{2}}}{\mu}\log\frac{1}{\varepsilon} \right)~~\textit{iterations and oracle calls.}
    \end{equation*}

\begin{proof}
    Theorem \ref{theorem} guarantees that Algorithm \ref{alg:egsarah} converges to $\varepsilon$-accuracy within $\mathcal{\widetilde{O}}\left(\frac{\log{\frac{1}{\varepsilon}}}{\gamma\mu}\right)$ iterations. Setting $\gamma$ with the upper bound $\frac{1}{30Ln^{\nicefrac{3}{2}}}$ we reach $\mathcal{\widetilde{O}}\left(\frac{Ln^{\nicefrac{3}{2}}}{\mu} \log{\frac{1}{\varepsilon}} \right)$ iteration complexity. Then, we note that average iteration cost is $2(1-p) + pn =3-2p$, which implies the same bound $\mathcal{\widetilde{O}}\left(\frac{Ln^{\nicefrac{3}{2}}}{\mu} \log{\frac{1}{\varepsilon}} \right)$ for the oracle complexity.
\end{proof}

\end{document}
