% \documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage[usestackEOL]{stackengine}

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2023} with \usepackage[nohyperref]{icml2023} above.
\usepackage{hyperref}


% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}
\input{cmd1.tex}

% Use the following line for the initial blind version submitted for review:
% \usepackage{icml2023}

% If accepted, instead use the following line for the camera-ready submission:

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage{algorithmic}
% \usepackage{pseudocode}

% \usepackage{algpseudocode}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\setlength {\marginparwidth }{2cm}
\usepackage[textsize=tiny]{todonotes}

\newcommand{\sX}{\R^{d_x}}
\newcommand{\sY}{\R^{d_y}}
\newcommand{\ol}[1]{\overline{#1}}
\newcommand{\ul}[1]{\underline{#1}}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\dom}{\mathrm{dom}}



\title{An Optimal Algorithm for Strongly Convex Min-Min Optimization}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Dmitry Kovalev}
\author[3,4,5]{Alexander Gasnikov}
\author[6]{Grigory Malinovsky}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
Yandex Research 
}
\affil[2]{%
Ivannikov Institute for System Programming
}
\affil[3]{%
AI Research Center, Innopolis University
  }
\affil[4]{%
Skolkovo Institute of Science and Technology (Skoltech)
  }
  \affil[5]{%
  Moscow Institute of Physics and Technology (MIPT)
  }
    \affil[6]{%
King Abdullah University of Science and Technology (KAUST)
  }

  
  \begin{document}
\maketitle

\begin{abstract}


We consider the problem of minimizing a function \( f(x, y) \), where \( f \) is a smooth and strongly convex function with respect to both variables, being \( \mu_x \)-strongly convex in \( x \) and \( \mu_y \)-strongly convex in \( y \). The optimal accelerated gradient method of Yurii Nesterov achieves a convergence rate that requires approximately \( \mathcal{O}((\min(\mu_x, \mu_y))^{-1/2}) \) evaluations of the partial gradients \( \nabla_x f \) and \( \nabla_y f \). In this paper, we propose a novel optimization algorithm that improves upon this complexity by requiring only \( \mathcal{O}(\mu_x^{-1/2}) \) computations of \( \nabla_x f \) and \( \mathcal{O}(\mu_y^{-1/2}) \) computations of \( \nabla_y f \). This improvement is particularly advantageous in scenarios where there is a significant disparity between the strong convexity parameters, specifically when \( \mu_x \gg \mu_y \). Furthermore, in practical applications where the computation of \( \nabla_y f \) is considerably more efficient than that of \( \nabla_x f \), the proposed method leads to a substantial reduction in the overall wall-clock time required for optimization. As a key application, we consider Partially Local Federated Learning, a setting in which the model is partitioned into a local component and a global component. We demonstrate how our proposed method can be effectively applied in this framework, highlighting its practical advantages in improving computational efficiency.


%\textbf{Key words:} convex optimization, accelerated coordinate descent, splitting methods, Federated Learning, Machine Learning
\end{abstract}

\section{Introduction}

The development of optimal (``black-box'') algorithms for fundamental classes of convex optimization problems dates back several decades \citep{nemirovski1983problem}. Contemporary research, however, often exploits the additional structural properties of optimization problems, effectively ``looking inside the black box'' \citep{nesterov2018lectures}. Many notable results in this direction focus on problems with a composite structure\footnote[2]{The function $F$ is $\mu$-strongly convex, while both $f$ and $g$ have Lipschitz-continuous gradients with constants $L_f$ and $L_g$, respectively.}, formulated as
\begin{equation}
    \min_x F(x):= f(x) + g(x),
\end{equation}  
where the complexity of the problem can be ``split'' into two components: approximately $\sqrt{L_f/\mu}$ evaluations of $\nabla f$ and $\sqrt{L_g/\mu}$ evaluations of $\nabla g$ \citep{lan2016gradient,ivanova2022oracle,kovalev2022optimal}.

However, there remains a significant gap in the literature regarding ``optimal results'' for the so-called \textit{min-min} problem:
\begin{equation}\label{eq:based_problem}
    \min_{x,y} f(x,y),
\end{equation}  
where the smoothness constants, strong convexity parameters, and computational complexities of $\nabla_x f$ and $\nabla_y f$ can vary significantly between the variables $x$ and $y$, as well as in their respective dimensionalities.  

Such problems frequently arise in various applications, including transportation modeling, where they play a crucial role in combined trip distribution and assignment \citep{de2005solving,gasnikov2014three}, as well as in soft clustering \citep{nesterov2020soft}. A particularly relevant application in Machine Learning can be seen in the Yahoo!\ Click-Prediction model proposed by \citet{dvurechensky2022hyperfast}:
\begin{align*}
    \min_{x\in \R^d} f(x) &:= \frac{1}{m}\sum_{k=1}^m \log\left(1 + \exp\left(-\eta^k\langle\xi^k,x\rangle\right)\right)\\
    &+ \lambda_S \sum_{i \in I_S} x_i^2 + \lambda_D \sum_{i \in I_D} x_i^2,
\end{align*}  
where $I_S\cup I_D = \left\{1,\dots,d \right\}$, $I_S\cap I_D = \varnothing$, with $|I_D|\gg |I_S|$ and $\lambda_S \gg \lambda_D$.

In this context, it is natural to define $x:=\left\{x_i\right\}_{i\in I_S}$ and $y:=\left\{x_i\right\}_{i\in I_D}$ in the formulation of \eqref{eq:based_problem}, highlighting the distinct structural differences in optimization complexities between these two variable groups.  



\subsection{Problem Setup and Overview of Main Result}

In this paper, we consider the following class of optimization problems:  
\begin{equation}\label{eq:main}
	\min_{x \in \R^{d_x},y \in \R^{d_y}} f(x,y),
\end{equation}  
where \( f(x,y) \colon \sX \times \sY \to \R \) is a convex function satisfying the assumptions outlined below. We impose the following standard smoothness and strong convexity conditions on \( f(x, y) \):  


\begin{assumption}\label{ass:L}  
The function \( f(x,y) \) is \( (L_x, L_y) \)-smooth with constants \( L_x, L_y > 0 \). That is, for all \( x_1, x_2 \in \sX \) and \( y_1, y_2 \in \sY \), the following inequality holds:  
\begin{equation}
		\begin{split}
				f(x_2,y_2) &\leq f(x_1,y_1) + \<\nabla_x f(x_1,y_1),x_2-x_1>\\&  \quad+\<\nabla_y f(x_1,y_1),y_2-y_1> \\&\quad+ \frac{L_x}{2}\sqn{x_2-x_1} + \frac{L_y}{2}\sqn{y_2-y_1}.
		\end{split}
\end{equation}  
\end{assumption}  

This condition implies that the gradients \( \nabla_x f \) and \( \nabla_y f \) are Lipschitz continuous with constants \( L_x \) and \( L_y \), respectively, up to a factor of 2.  

\begin{assumption}\label{ass:mu}  
The function \( f(x,y) \) is \( (\mu_x, \mu_y) \)-strongly convex with constants \( \mu_x, \mu_y > 0 \). That is, for all \( x_1, x_2 \in \sX \) and \( y_1, y_2 \in \sY \), the function satisfies:  
\begin{equation}
		\begin{split}
			f(x_2,y_2) &\geq f(x_1,y_1) + \<\nabla_x f(x_1,y_1),x_2-x_1>\\
   &\quad+\<\nabla_y f(x_1,y_1),y_2-y_1> \\&\quad+ \frac{\mu_x}{2}\sqn{x_2-x_1} + \frac{\mu_y}{2}\sqn{y_2-y_1}.
		\end{split}
\end{equation}  
\end{assumption}  

This assumption ensures that \( f(x,y) \) exhibits strong convexity in both \( x \) and \( y \), which is crucial for achieving fast convergence rates using accelerated methods.  

 

The main contribution of this paper is the introduction of the Block Accelerated Method (BAM) (see Section~\ref{sec:Algorithm}), which efficiently solves problem \eqref{eq:main} to a relative precision \( \epsilon \) with the following computational complexity:  

\begin{center}
\framebox{\Longstack[c]{
\(\mathcal{O}\left(\sqrt{\frac{L_x}{\mu_x}}\log \frac{1}{\epsilon}\right)\) evaluations of \( \nabla_x f \) \\ \\ 
and \\ \\ 
\(\mathcal{O}\left(\sqrt{\frac{L_y}{\mu_y}}\log \frac{1}{\epsilon}\right)\) evaluations of \( \nabla_y f \).
}}
\end{center}  

These complexity bounds match the known lower bounds for strongly convex smooth optimization, as established in classical results by \citet{nemirovski1983problem} and \citet{nesterov2018lectures}. Therefore, our method is optimal in terms of the number of gradient evaluations required for solving \eqref{eq:main}.  

Moreover, when \( f(x, y) \) is convex but not strongly convex in one or both blocks, we can apply a regularization technique (see, e.g., \citet{gasnikov2016efficient}) to transform the problem into a strongly convex one. Specifically, by introducing a small regularization term, we can ensure strong convexity with a parameter of approximately \( \mu_{\circ} \sim \epsilon / R^2 \), where \( R \) is the Euclidean distance between the initial point and the closest optimal solution. This allows us to extend the applicability of our method to a broader class of convex problems.

\subsection{Related works}
% The close problem formulation considered when studied accelerated coordinate-descent methods \citep{nesterov2012efficiency,richtarik2014iteration,nesterov2017efficiency,ivanova2021adaptive}.\footnote{Note that the the first accelerated gradient schemes \citep{nesterov2012efficiency,richtarik2014iteration} do not allow to obtain this result. The first time this results was announced in \citep{nesterov2015polyak80} by using special coordinate-wise randomization: $p_x \sim \sqrt{L_x}$ and $p_y \sim \sqrt{L_y}$. Next it was developed at different works with different variations \citep{gasnikov2015accelerated,allen2016even,nesterov2017efficiency}.}

% However, the result is quite different: with probability $\ge 1 - \delta$
% \begin{center}
% $\cO\left(\sqrt{\frac{{L}_x}{\min\left\{\mu_x,\mu_y\right\} } }\log \frac{1}{\epsilon}\log \frac{1}{\delta}\right)$ calculations of $\nabla_x f$  and $\cO\left(\sqrt{\frac{{L}_y}{\min\left\{\mu_x,\mu_y\right\}}}\log \frac{1}{\epsilon}\log \frac{1}{\delta}\right)$ calculations of $\nabla_y f$,
% \end{center}
% where ${L}_x$ is Lipschitz constant of $\nabla_x f(x,y)$ as a function of $x$ and ${L}_y$ is Lipschitz constant of $\nabla_y f(x,y)$ as a function of $y$. 

% The same results, but with worse smoothness constants, hold for accelerated alternating methods \citep{beck2017first,diakonikolas2018alternating,guminov2021combination,tupitsa2021alternating}.

% Note that by using re-scaling of $y$ variables $y':=\sqrt{\mu_y/\mu_x} y$ it is possible to equalise the constants of strong convexity $\mu_x = \mu_{y'}$. Applying accelerated coordinate-descent from \citep{nesterov2017efficiency} to the re-scaled problem we can obtain in standard variables: 
% \begin{center}
% $\cO\left(\sqrt{\frac{{L}_x}{\mu_x} }\log \frac{1}{\epsilon}\log \frac{1}{\delta}\right)$ calculations of $\nabla_x f$  and $\cO\left(\sqrt{\frac{{L}_y}{\mu_y}}\log \frac{1}{\epsilon}\log \frac{1}{\delta}\right)$ calculations of $\nabla_y f$.
% \end{center}
% This result is close to our result, but our approach is deterministic that is better in $\log \frac{1}{\delta}$ times. Moreover, our results are based on very different ideas.  

% In the cycle of works  
% \citep{bolte2020ah,gladin2021solvingMinMin,gladin2021solving,ostroukhov2022tensor} it was proposed to solve \eqref{eq:main} an outer optimization problem in $x$ block with inexact gradient oracle determined by the solution of the inner problem in $y$ block:
% \begin{align*}
%     \min_x F(x):&= \min_y f(x,y),\\
%     \nabla F(x) &= \nabla_x f(x,y(x))\\
%     &= \frac{\partial f}{\partial x} \left(x,y\right)\Big|_{y=y(x)},
% \end{align*}
%     where $y(x)$ is determined as the solution of $\min_y f(x,y)$.


% The most practical results were obtained in the case when $x \in Q \subset \R^{d_x}$, where $d_x$ is small: 
% \begin{center}
% $\tilde{\cO}\left(d_x\log \frac{1}{\epsilon}\right)$ calculations of $\nabla_x f$  and $\tilde{\cO}\left(d_x\sqrt{\frac{L_y}{\min\left\{\mu_x,\mu_y\right\}}}\log^2 \frac{1}{\epsilon}\right)$ calculations of $\nabla_y f$.
% \end{center}
% Note that the known lower bound assumes: 
% \begin{center}
% $\cO\left(d_x\log \frac{1}{\epsilon}\right)$ calculations of $\nabla_x f$  and $\cO\left(\sqrt{\frac{L_y}{\min\left\{\mu_x,\mu_y\right\}}}\log \frac{1}{\epsilon}\right)$ calculations of $\nabla_y f$.
% \end{center}

\begin{algorithm*}[t!]
	\caption{\textbf{B}lock \textbf{A}ccelerated \textbf{M}ethod (BAM)}
	\begin{algorithmic}\label{alg:sliding}
		\STATE {\bf Parameters:} $\eta_x,\eta_y > 0$, $\theta_x, \theta_y > 0$, $\alpha \in (0,1)$ 
		\STATE {\bf Input:} $x^0 = \ol{x}^0 \in \sX$, $y^0 = \ol{y}^0 \in \sY$
		\FOR{$k=0,1,\ldots, K-1$}
			\STATE $\ul{x}^k = \alpha x^k + (1-\alpha)\ol{x}^k $
			\STATE $\ul{y}^k = \alpha y^k + (1-\alpha)\ol{y}^k$
			\STATE find $\ol{y}^{k+1}$ such that
			\begin{equation}\label{eq:ms}
				\norm{\nabla_y f(\ul x^k, \ol y^{k+1}) + (\eta_y\alpha)^{-1}(\ol y^{k+1}-\ul y^k)} \leq (\eta_y\alpha)^{-1}\norm{\ol y^{k+1} - \ul y^k}.
			\end{equation}
			\STATE $\ol{x}^{k+1} = \ul x^k - \eta_x\alpha \nabla_x f(\ul x^k, \ol y^{k+1})$
			\STATE $x^{k+1} = x^k + \alpha(\ul{x}^k - x^{k+1}) - \eta_x \nabla_x f(\ul x^k, \ol y^{k+1})$
			\STATE $y^{k+1} = y^k + \alpha(\ol{y}^{k+1} - y^{k+1}) - \eta_y \nabla_y f(\ul x^k, \ol y^{k+1})$
		\ENDFOR
	\end{algorithmic}
\end{algorithm*}
 

The problem formulation we consider in this paper is closely related to those studied in the context of accelerated coordinate-descent methods \citep{nesterov2012efficiency, richtarik2014iteration, nesterov2017efficiency, ivanova2021adaptive}. However, while the formulation is similar, our results differ significantly.  

In particular, prior work on accelerated coordinate-descent methods has established that, with probability at least \(1 - \delta\), the complexity bounds for solving our problem \eqref{eq:main} using randomized coordinate-wise acceleration are:  
\begin{center}
    \(\mathcal{O}\left(\sqrt{\frac{L_x}{\min\left\{\mu_x,\mu_y\right\} } }\log \frac{1}{\epsilon} \log \frac{1}{\delta}\right)\) evaluations of \( \nabla_x f \)  
    \\
    and  
    \\
    \(\mathcal{O}\left(\sqrt{\frac{L_y}{\min\left\{\mu_x,\mu_y\right\}}}\log \frac{1}{\epsilon} \log \frac{1}{\delta}\right)\) evaluations of \( \nabla_y f \),  
\end{center}  
where \( L_x \) and \( L_y \) are the Lipschitz constants of \( \nabla_x f(x, y) \) with respect to \( x \) and \( \nabla_y f(x, y) \) with respect to \( y \), respectively.  

The first accelerated coordinate-descent methods \citep{nesterov2012efficiency, richtarik2014iteration} did not yield these results directly. The breakthrough came with \citet{nesterov2015polyak80}, which introduced a specialized coordinate-wise randomization scheme with probabilities \( p_x \sim \sqrt{L_x} \) and \( p_y \sim \sqrt{L_y} \). This approach was further developed in subsequent works, leading to various algorithmic refinements \citep{gasnikov2015accelerated, allen2016even, nesterov2017efficiency}.

Similar complexity bounds, albeit with slightly worse smoothness constants, have also been derived for \emph{accelerated alternating methods} \citep{beck2017first, diakonikolas2018alternating, guminov2021combination, tupitsa2021alternating}.

% #### **Comparison via Re-Scaling of Variables**  

An alternative way to analyze the complexity of solving \eqref{eq:main} is through a variable re-scaling approach. Specifically, by introducing a re-scaled variable \( y' := \sqrt{\mu_y/\mu_x} y \), we can equalize the strong convexity constants such that \( \mu_x = \mu_{y'} \). Applying the accelerated coordinate-descent method from \citet{nesterov2017efficiency} to the re-scaled problem, we obtain the following complexity bounds in the original variables:
\begin{center}
    \(\mathcal{O}\left(\sqrt{\frac{L_x}{\mu_x} }\log \frac{1}{\epsilon} \log \frac{1}{\delta}\right)\) evaluations of \( \nabla_x f \)  
    \\
    and  
    \\
    \(\mathcal{O}\left(\sqrt{\frac{L_y}{\mu_y}}\log \frac{1}{\epsilon} \log \frac{1}{\delta}\right)\) evaluations of \( \nabla_y f \).  
\end{center}  

These results are closely related to our findings, but our approach provides an important advantage: our method is fully deterministic, which eliminates the additional logarithmic dependence on \( \delta \) present in randomized methods. Moreover, our derivation is based on fundamentally different theoretical principles, further distinguishing our work from prior research.  

% ### **Outer-Inner Optimization Methods**  

An alternative line of research approaches problem \eqref{eq:main} using a nested optimization framework, where the outer optimization is performed over \( x \), and the inner problem in \( y \) is solved approximately to provide an inexact gradient oracle. This methodology has been explored in a series of works \citep{bolte2020ah, gladin2021solvingMinMin, gladin2021solving, ostroukhov2022tensor}, where the objective function is reformulated as:  
\begin{align*}
    \min_x F(x):&= \min_y f(x,y),\\
    \nabla F(x) &= \nabla_x f(x,y(x))\\
    &= \frac{\partial f}{\partial x} \left(x,y\right)\Big|_{y=y(x)},
\end{align*}
where \( y(x) \) is defined as the solution to the inner minimization problem \( \min_y f(x,y) \).  

% #### **Complexity Bounds in the Low-Dimensional Case**  

The most practical results in this framework have been obtained for problems where \( x \) belongs to a low-dimensional set \( Q \subset \mathbb{R}^{d_x} \), where \( d_x \) is relatively small. In this case, the established complexity bounds are:  
\begin{center}
    \(\tilde{\mathcal{O}}\left(d_x\log \frac{1}{\epsilon}\right)\) evaluations of \( \nabla_x f \)  
    \\
    and  
    \\
    \(\tilde{\mathcal{O}}\left(d_x\sqrt{\frac{L_y}{\min\left\{\mu_x,\mu_y\right\}}}\log^2 \frac{1}{\epsilon}\right)\) evaluations of \( \nabla_y f \).  
\end{center}  

Interestingly, the known lower bounds for this setting suggest:  
\begin{center}
    \(\Omega\left(d_x\log \frac{1}{\epsilon}\right)\) evaluations of \( \nabla_x f \)  
    \\
    and  
    \\
    \(\Omega\left(\sqrt{\frac{L_y}{\min\left\{\mu_x,\mu_y\right\}}}\log \frac{1}{\epsilon}\right)\) evaluations of \( \nabla_y f \).  
\end{center}  
However, it remains unclear whether this lower bound is tight, leaving room for potential improvements in future research.  

% #### **Generalization to Alternative Inner Problem Oracles**  

This nested optimization framework has also been extended to scenarios involving various types of inner problem oracles: gradient-free approaches \citep{gladin2021solving}, randomized variance-reduced methods \citep{gladin2021solvingMinMin}, and higher-order tensor methods \citep{ostroukhov2022tensor}.

Despite these advances, the performance of these methods deteriorates significantly when \( d_x \) is large. In this case, the outer method must be accelerated, leading to complexity bounds of:  
\begin{center}
    \(\mathcal{O}\left(\sqrt{\frac{L_x}{\mu_x}}\log \frac{1}{\epsilon}\right)\) evaluations of \( \nabla_x f \)  
    \\
    and  
    \\
    \(\mathcal{O}\left(\sqrt{\frac{L_x L_y}{\mu_x\mu_y}}\log^2 \frac{1}{\epsilon}\right)\) evaluations of \( \nabla_y f \).  
\end{center}  

This bound is significantly worse in terms of the number of \( \nabla_y f \) evaluations compared to our method.  

To summarize, prior to our work, no known deterministic optimization algorithm could achieve an independent complexity bound for each block without sacrificing theoretical guarantees. Our method provides the first fully deterministic approach that effectively decouples the complexities into separate terms for \( x \) and \( y \), achieving an optimal rate without requiring coordinate-wise randomization or nested optimization frameworks.  



% TODO(review): the numeric markers [1]--[8] below are hard-coded and not linked to the bibliography; replace each with the corresponding \citep{...} key.
The removal of the logarithmic factor in our analysis constitutes a substantial theoretical contribution. Eliminating stochasticity—and thus the associated logarithmic overhead—not only simplifies the analysis but also enhances the stability of the method by removing the need to average over multiple runs to estimate convergence behavior, a common requirement in stochastic settings. Our proof technique diverges significantly from standard analyses of accelerated coordinate methods that rely on coordinate sampling, thereby advancing the theoretical foundations for such methods. Historically, the elimination of logarithmic factors has marked several key breakthroughs in optimization. For instance, Accelerated Gradient Descent [1], while often seen as a novel application of momentum, can also be interpreted as eliminating logarithmic terms from the complexity of the conjugate gradient method [2]. Katyusha [3], a milestone in stochastic optimization, achieved direct acceleration with variance reduction, essentially refining the log-dependent Catalyst framework [4]. Similar log-factor removals underpinned major advances in online learning [5–7] and resolved a long-standing problem in the multi-armed bandit setting [8]. These precedents underscore the theoretical depth and practical value of eliminating such terms, highlighting the significance of our contribution.




\section{Main Algorithm}\label{sec:Algorithm}



The development of the Block Accelerated Method (BAM) was influenced by a series of recent advancements in optimization, particularly those presented by \citet{kovalev2022optimal,kovalev2022first,kovalev2022first_high} (see also \citep{ivanova2021adaptive,gasnikov2021accelerated,carmon2022recapp}). These works leverage inner-loop acceleration techniques, akin to catalyst-type methods, to derive optimal accelerated algorithms for saddle-point problems and high-order optimization methods.  

However, it is important to emphasize that BAM represents a fundamentally different approach. While previous methods primarily focus on achieving optimal acceleration through nested iterations or high-order techniques, BAM is explicitly designed to decouple the complexities associated with different variable blocks. This distinction is crucial because splitting the computational burden into independent complexity bounds for each block is nontrivial and requires a novel algorithmic framework. Unlike existing methods that rely on uniform acceleration across all variables, BAM introduces a tailored acceleration mechanism that optimally balances the computational effort required for different blocks, ensuring efficiency without resorting to coordinate-wise randomization or nested optimization schemes.

Let us provide a detailed description of the BAM method. The first step involves computing convex combinations for both coordinate blocks. This operation can be interpreted as a form of momentum, which plays a central role in achieving acceleration:
\begin{align*}
    \ul{x}^k &= \alpha x^k + (1 - \alpha) \ol{x}^k, \\
    \ul{y}^k &= \alpha y^k + (1 - \alpha) \ol{y}^k.
\end{align*}
Next, we solve a subproblem to ensure the following condition is satisfied:
\begin{align*}
     &\left\Vert \nabla_y f\left(\ul{x}^k, \ol{y}^{k+1}\right) + \left(\eta_y \alpha\right)^{-1}\left(\ol{y}^{k+1} - \ul{y}^k\right) \right\Vert\\
     &\leq \left(\eta_y \alpha\right)^{-1} \left\Vert \ol{y}^{k+1} - \ul{y}^k \right\Vert.
\end{align*}
This step is crucial for separating the complexity of different components and for enabling acceleration; it is also essential for the theoretical analysis. Subsequently, the method performs a gradient step on the server block:
\begin{equation*}
    \ol{x}^{k+1} = \ul{x}^k - \eta_x \alpha \nabla_x f\left(\ul{x}^k, \ol{y}^{k+1}\right).
\end{equation*}
Finally, the algorithm updates both coordinate blocks using gradient steps that incorporate the difference between iterations. This mechanism is another key component contributing to acceleration:
\begin{align*}
    x^{k+1} &= x^k + \alpha \left(\ul{x}^k - x^{k+1} \right) - \eta_x \nabla_x f\left(\ul{x}^k, \ol{y}^{k+1}\right), \\
    y^{k+1} &= y^k + \alpha \left(\ol{y}^{k+1} - y^{k+1} \right) - \eta_y \nabla_y f\left(\ul{x}^k, \ol{y}^{k+1}\right).
\end{align*}

The theoretical guarantees and complexity bounds established in this work are fundamentally dependent on a key technical result, which we formalize in the core lemma below. 

\begin{algorithm*}
	\caption{\textbf{O}ptimized \textbf{G}radient \textbf{M}ethod (OGM-G)}
	\begin{algorithmic}\label{alg:OGM-G}
		\STATE {\bf Parameters:} stepsize $\gamma$, sequence $\tilde{\theta}_i$:
 \STATE $\tilde{\theta}_i= \begin{cases}\frac{1+\sqrt{1+8 \tilde{\theta}_{i+1}^2}}{2}, & i=0, \\ \frac{1+\sqrt{1+4 \tilde{\theta}_{i+1}^2}}{2}, & i=1, \ldots, N-1, \\ 1, & i=N,\end{cases}$
		\STATE {\bf Input:} $x_0 = y_0 \in \mathbb{R}^d$
		\FOR{$i=0,1,\ldots, N-1$}
			\STATE $y_{i+1}=x_i-\gamma\nabla f\left(x_i\right) $
			\STATE $x_{i+1}=y_{i+1}+\frac{\left(\tilde{\theta}_i-1\right)\left(2 \tilde{\theta}_{i+1}-1\right)}{\tilde{\theta}_i\left(2 \tilde{\theta}_i-1\right)}\left(y_{i+1}-y_i\right)+\frac{2 \tilde{\theta}_{i+1}-1}{2 \tilde{\theta}_i-1}\left(y_{i+1}-x_i\right)$
		\ENDFOR
	\end{algorithmic}
\end{algorithm*}

\begin{lemma}\label{lem:descent}
	Let $\eta_x$ satisfy
	$
		\eta_x \leq (\alpha L_x)^{-1}.
	$
	Then, the following inequality holds:
	\begin{align}
	    		-f(\ul x^{k}, \ol y^{k+1}) &\leq - f(\ol x^{k+1}, \ol y^{k+1})\\\notag
       &\quad-\frac{\eta_x\alpha}{2} \sqn{\nabla_x f(\ul x^{k}, \ol y^{k+1})}.
	\end{align}


\end{lemma}
We now formally present our main theoretical result in the theorem stated below. This theorem encapsulates the core contribution of our work, providing a rigorous statement of the achieved complexity bounds and demonstrating the effectiveness of the proposed algorithm.

\begin{theorem}\label{thm:sliding}
	Let $\cR_x^k = \sqn{x^k - x^*}$, $\cR_y^k = \sqn{y^k - y^*}$.
	Let $\Psi^k$ be the following Lyapunov function:
	\begin{align}
		\Psi^k &= (1+\alpha)\left(\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}\right)\\\notag
		&\quad+
		\frac{2}{\alpha}\left( f(\ol x^{k}, \ol y^{k}) - f(x^*,y^*)\right).
	\end{align}
	Let parameters $\eta_x,\eta_y,\alpha$ be defined as follows:
	\begin{equation}
		\alpha = \sqrt{\frac{\mu_x}{L_x}}, \qquad \eta_x = \frac{1}{\sqrt{\mu_x  L_x}}, \qquad \eta_y =\frac{1}{\mu_y}\sqrt{ \frac{\mu_x}{L_x}}.
	\end{equation} 
	Then, iterations of \Cref{alg:sliding} satisfy the following inequality:
	\begin{equation}
		\Psi^{k+1} \leq (1+\alpha)^{-1}\Psi^k.
	\end{equation}
\end{theorem}
\section{Inner Algorithm}

We define the auxiliary function \( A^k(y) \colon \sY \to \mathbb{R} \) as follows:
\begin{equation}
	A^k(y) = f(\ul x^k, y) + \frac{1}{2\eta_y\alpha}\sqn{y - \ul y^k}.
\end{equation}
Then, the condition in \eqref{eq:ms} from \Cref{alg:sliding} can be equivalently written as:
\begin{equation}\label{eq:ms2}
	\norm{\nabla A^k (\ol y^{k+1})} \leq (\eta_y\alpha)^{-1}\norm{\ol y^{k+1} - \ul y^k}.
\end{equation}
To find \( \ol{y}^{k+1} \) that satisfies this condition, we apply an optimal algorithm for gradient norm reduction \citep{diakonikolas2022potential, kim2021optimizing} to the minimization problem:
\begin{equation}\label{eq:aux}
	\min_{y \in \sY} A^k(y).
\end{equation}

The following theorem, taken from Remark 1 of \citet{nesterov2021primal}, applies to this setup.

\begin{theorem}
There exists an algorithm that, when applied to problem \eqref{eq:aux} with starting point \( \ul{y}^k \), produces \( \ol{y}^{k+1} \) satisfying:
\begin{equation}
\label{eq:inner}
\| \nabla A^k(\ol{y}^{k+1}) \| \leq \frac{C \max\{L_y, (\eta_y \alpha)^{-1}\} \| \ul{y}^k - \ul{y}^* \|}{T^2},
\end{equation}
where \( T \) is the number of calls to \( \nabla A^k(y) \), \( \ul{y}^* = \argmin_{y \in \sY} A^k(y) \), and \( C > 0 \) is a universal constant.
\end{theorem}

\begin{corollary}\label{cor:inner}
To output \( \ol{y}^{k+1} \) that satisfies condition \eqref{eq:ms2}, the inner algorithm requires the following number of iterations:
\begin{equation}
\label{eq:T}    
T = \sqrt{2C} \max \left\{ 1, \sqrt{\eta_y \alpha L_y} \right\}.
\end{equation}
\end{corollary}

A simple approach to achieve the optimal rate \( \mathcal{O}\left(\frac{1}{T^2}\right) \) for gradient norm reduction under the initial distance condition involves running Nesterov Accelerated Gradient for the first \( T/2 \) iterations and then applying the OGM-G algorithm (\Cref{alg:OGM-G}) with \( N = T/2 \) for the remaining \( T/2 \) iterations.

The OGM-G algorithm utilizes a sequence of scalar coefficients \( \tilde{\theta}_i \), defined by a backward recursion starting from \( \tilde{\theta}_N = 1 \), which determines the weights used in the iterations. The first step of each iteration is a gradient step, while the second step is an acceleration step using previous points and the coefficients \( \tilde{\theta}_i \).



\section{Total Complexity}

Let us now formulate and summarize the key results, followed by an analysis of the total computational complexity.

From \Cref{thm:sliding}, we can conclude that to find an \( \epsilon \)-accurate solution to problem~\eqref{eq:main}, \Cref{alg:sliding} requires the following number of calls to \( \nabla_x f(x,y) \):
\begin{equation}
	K = \cO\left(\sqrt{\frac{L_x}{\mu_x}}\log \frac{1}{\epsilon}\right).
\end{equation}
Additionally, \Cref{cor:inner}, in conjunction with the parameter choices in \Cref{alg:sliding} as derived from \Cref{thm:sliding}, implies that the number of inner iterations is:
\begin{equation}
    \begin{split}
        	T &= \cO\left(\max\{1, \sqrt{\eta_y\alpha L_y}\}\right)\\
 &=\cO\left(\max\left\{1, \sqrt{\frac{L_y\mu_x}{L_x\mu_y}}\right\}\right).
    \end{split}
\end{equation}

Thus, the total number of calls to \( \nabla_y f(x,y) \) is:
\begin{align*}
	K\times T
	&=
	\cO\left(\sqrt{\frac{L_x}{\mu_x}}\log \frac{1}{\epsilon}\right) \times  \cO\left(\max\left\{1, \sqrt{\frac{L_y\mu_x}{L_x\mu_y}}\right\}\right)
	\\&=\cO\left(\max\left\{\sqrt{\frac{L_x}{\mu_x}},\sqrt{\frac{L_y}{\mu_y}}\right\}\log \frac{1}{\epsilon}\right)
    \end{align*}
This expression provides a concise description of the total complexity required for solving the problem to \( \epsilon \)-accuracy, considering the number of gradient evaluations in both blocks.



\section{Federated Learning application}

\subsection{Collaborative learning}

Federated learning is a robust machine learning paradigm in which multiple clients (or workers) collaborate to train a shared model in a distributed environment, while ensuring that the clients' local data remains private~\citep{mcmahan2017communication}. This privacy is critical, as it allows for training on sensitive or proprietary data without the need to share it across participants. Typically, the data is distributed across numerous clients, and communication occurs only with a central server in the centralized regime~\citep{konevcny2016federated}. In contrast, in the decentralized regime~\citep{koloskova2020unified}, clients interact based on a predefined communication graph, without relying on a central coordinator or server, enabling more flexible communication architectures. A key example of federated learning is in developing machine learning models for applications such as text prediction in mobile keyboards, where sensitive user data (such as typed text) is never shared between clients or with the server, maintaining privacy.

Federated learning is deployed in a variety of settings, including both cross-device and cross-silo environments. In cross-device settings, such as mobile devices or IoT devices, data is typically distributed across a large number of individual devices, and federated learning allows for the creation of global models without transferring sensitive data~\citep{hard2018federated}. In cross-silo settings, such as corporate or institutional collaborations, data is distributed across a smaller number of entities (e.g., hospitals or banks), where federated learning facilitates model training across organizations while ensuring privacy and compliance with regulations~\citep{rieke2020future}.

In standard federated learning approaches, a single global model is trained using local updates from clients. One of the most commonly used algorithms is FedAvg~\citep{khaled2020tighter,woodworth2020local}, which reduces communication costs—typically the major bottleneck in federated learning—by allowing clients to perform several local gradient steps before sending their updates to the central server for aggregation. While this approach helps reduce communication frequency, it suffers from poor convergence guarantees in the presence of data heterogeneity, especially when no additional assumptions about data similarity are made. To overcome these limitations, several enhanced methods have been proposed~\citep{karimireddy2020scaffold,mitra2021linear,gorbunov2021local}, which achieve linear convergence rates in deterministic settings. However, despite these improvements, the communication complexity of these methods still does not outperform vanilla gradient descent (GD) because of the small step sizes required in the analysis.

In a more recent advancement~\citep{mishchenko2022proxskip}, it was shown that incorporating local steps into the training process can indeed accelerate communication, offering a promising approach for improving the efficiency of federated learning. This has been further developed in subsequent works that extend this mechanism to various problem settings~\citep{malinovsky2022variance,grudzien2022can,condat2022provably}. These studies provide valuable insights into how local optimization strategies can complement global model aggregation, thereby enhancing the overall communication efficiency without sacrificing convergence speed.




However, global model training can be prohibited in some settings even without sharing data due to privacy constraints. For example, using client-specific embeddings can reveal user identity, which is not allowed by a privacy policy. To address this issue, the concept of partial federated learning was introduced~\citep{singhal2021federated}. In this approach, models have two blocks of parameters: a global block $x$ and local blocks $y_i$, which never leave the clients. This technique enables interpolation between distributed and non-distributed learning. Partial federated learning is closely connected to personalization and meta-learning algorithms. The most popular meta-learning algorithm is MAML~\citep{finn2017model}, and its connection to federated learning was established in several works~\citep{nichol2018first, chen2018federated, fallah2020personalized}.



\subsection{Federated Reconstruction}

Let us describe the baseline of partial federated learning known as Federated Reconstruction~\citep{singhal2021federated}. In this framework, there are two blocks of coordinates: user-specific parameters \(y_i\) and non-user-specific parameters \(x\). During each communication round, the server sends the global part of the parameters \(x_t\) to all clients. Each client then reconstructs its local parameters \(y_t^i\) using the current global model \(x_t\). This reconstruction process generally requires several steps. Once the local model is restored, each client updates its copy of the global parameters and sends only the updated copies back to the server. The server then aggregates these updates and generates the next iterate \(x_{t+1}\).

\begin{figure*}[t!]
	\centering
	\begin{tabular}{ccc}
        \includegraphics[width=0.31\linewidth]{log_001-00-1-upd_x_mm.pdf}&
		\includegraphics[width=0.31\linewidth]{log_001-000-1-upd_x_m.pdf}&	
		\includegraphics[width=0.31\linewidth]{log_001-0000-5-upd_x.pdf}\\
  	    \includegraphics[width=0.31\linewidth]{log_001-00-1-upd_y_mm.pdf}&	
        \includegraphics[width=0.31\linewidth]{log_001-000-1-upd_y_m.pdf}&	
		\includegraphics[width=0.31\linewidth]{log_001-0000-5-upd_y.pdf}
	\end{tabular}
	\caption{Comparison of Block Accelerated Method (BAM), Nesterov Accelerated Method (NAG), Accelerated Coordinate Descent Method (ACDM), and Linear Coupling method (LinCoupling) on logistic regression loss functions with two different $l_2$ regularizers. The first line represents the rate in terms of the $\nabla_x f(x,y)$ oracle calls, and the second one represents the rate in terms of the $\nabla_y f(x,y)$ oracle calls. We set $\mu_y = 0.002$ (left column), $\mu_y = 0.0001$ (middle column) and $\mu_y = 0.00005$ (right column). }
	\label{ris:image2}

\end{figure*}



The newly proposed BAM algorithm can be extended to minimize \(f(x, y_1, \dots, y_n)\) in a distributed setting and can be applied to Federated Reconstruction~\citep{singhal2021federated}. Since the communication complexity depends on the number of calls to \(\nabla_x f(x, y_1, \dots, y_n)\), the communication complexity of this method is \(\mathcal{O}\left( \sqrt{\frac{L_x}{\mu_x}} \log \frac{1}{\varepsilon} \right)\). This bottleneck in communication can be alleviated when the condition number of the local parameters is small. Furthermore, this communication complexity is optimal.

We now elaborate on how to apply the Block Accelerated Method (BAM) in a distributed setting under the Federated Reconstruction framework. We consider a federated system with $n$ clients and the following objective function:

$$ f(x, y_1, \ldots, y_n) = \frac{1}{n} \sum_{i=1}^{n} f_i(x, y_i), $$

where $ x $ denotes the global (server-specific) model block, and $ y_i $ represents the local block associated with client $ i $. Each local loss function $ f_i $ depends only on the global variable $ x $ and the corresponding local variable $ y_i $.



We now describe the Federated BAM algorithm. At the beginning of each communication round, the server computes the extrapolated global model: 
$$ \underline{x}^k = \alpha x^k + (1 - \alpha) \bar{x}^k. $$

The server then broadcasts $ \underline{x}^k $ to all clients. Upon receiving this model, each client computes the extrapolated local model: 
$$ \underline{y}_i^k = \alpha y_i^k + (1 - \alpha) \bar{y}_i^k. $$

Each client then solves a local subproblem of the form~\eqref{eq:aux} to find $ \bar{y}_i^{k+1} $ such that



\begin{align*}
 &\left\Vert \nabla_y f\left(\underline{x}^k, \bar{y}_i^{k+1}\right) + (\eta_y \alpha)^{-1} \left(\bar{y}_i^{k+1} - \underline{y}_i^k\right) \right\Vert \\ &\leq (\eta_y \alpha)^{-1} \left\Vert \bar{y}_i^{k+1} - \underline{y}_i^k \right\Vert. 
\end{align*}


\begin{figure*}[t!]
	\centering
	\begin{tabular}{ccc}
		\includegraphics[width=0.31\linewidth]{10_100_x_E1_50_500.pdf}&	\includegraphics[width=0.31\linewidth]{10_100_x_E1_50_5000.pdf}&
		\includegraphics[width=0.31\linewidth]{10_100_x_E1_50_50000.pdf}\\
  		\includegraphics[width=0.31\linewidth]{10_100_y_E1_50_500.pdf}&	\includegraphics[width=0.31\linewidth]{10_100_y_E1_50_5000.pdf}&
		\includegraphics[width=0.31\linewidth]{10_100_y_E1_50_50000.pdf}
	\end{tabular}
	\centering
	\caption{Comparison of Block Accelerated Method (BAM), Nesterov Accelerated Method (NAG), Accelerated Coordinate Descent Method (ACDM) and Linear Coupling method (LinCoupling) on quadratic functions. First line represents rate in terms of the $\nabla_x f(x,y)$ oracle calls and the second one represents rate in terms of the $\nabla_y f(x,y)$ oracle calls. We set $L_y = 500$ (left column), $L_y = 5000$ (middle column) and $L_y = 50000$ (right column).  }
	\label{ris:image1}
\end{figure*}
After solving this subproblem, the client updates its local variable as follows:

$$ y_i^{k+1} = y_i^k + \alpha \left( \bar{y}_i^{k+1} - y_i^k \right) - \eta_y \nabla_y f\left(\underline{x}^k, \bar{y}_i^{k+1}\right). $$

Each client also computes the gradient with respect to the global variable:
$ \nabla_x f_i\left(\underline{x}^k, \bar{y}_i^{k+1}\right), $
and sends it to the server. The server aggregates these gradients to compute the full global gradient:
$$ \nabla_x f\left(\underline{x}^k, \bar{y}_1^{k+1}, \ldots, \bar{y}_n^{k+1}\right) = \frac{1}{n} \sum_{i=1}^n \nabla_x f_i\left(\underline{x}^k, \bar{y}_i^{k+1}\right). $$
Using the aggregated gradient, the server updates the global model as follows:
 \begin{align*} \bar{x}^{k+1} &= \underline{x}^k - \eta_x \alpha \nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), \\ x^{k+1} &= x^k + \alpha \left( \underline{x}^k - x^k \right) - \eta_x \nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right). \end{align*} 

The process repeats until convergence, with local parameters $y_i$ staying on clients and only the global parameter $x$ shared with the server.

Experimental results for partial federated learning can be found in \citet{singhal2021federated, mishchenko2023partially}. A detailed study of the practical application of BAM to Partial Personalized Federated Learning with deep learning models is left for future work.






\section{Experiments}
In all of our experiments, we compare the proposed Block Accelerated Method (BAM) with several well-established optimization methods to assess its performance and effectiveness. These methods include the Nesterov Accelerated Method (NAG) \citep{nesterov1983method}, which is a classical approach for smooth convex optimization, the Accelerated Coordinate Descent Method (ACDM) \citep{nesterov2017efficiency}, known for its efficiency in coordinate-wise optimization, and the Linear Coupling Method (LinCoupling) \citep{allen2016even, gasnikov2015accelerated}, which provides a framework for optimizing coupled problems. 



\subsection{Quadratic objectives}
In our experiments, we begin by considering quadratic functions of the form:
$$
f(z) = z^\top A z + b^\top z,
$$
where \( z = (x, y)^\top \) represents a joint vector consisting of two blocks: \( x \) and \( y \). The matrix spectrum is uniformly generated for each block, with eigenvalues for the block \( x \) ranging from \( \mu_x \) to \( L_x \), and eigenvalues for the block \( y \) ranging from \( \mu_y \) to \( L_y \). For this setup, we set \( \mu_x = \mu_y = 0.1 \), and \( L_x = 50 \). The dimensions of the blocks are set to \( d_x = 100 \) for \( x \) and \( d_y = 10 \) for \( y \). 

To analyze the impact of varying condition numbers, we adjust the parameter \( L_y \) to generate different values of the condition number \( \kappa_y \). Throughout the experiments, we focus on comparing the number of oracle calls for \( \nabla_x f(x,y) \) and \( \nabla_y f(x,y) \) across several optimization methods. This allows us to evaluate the efficiency of each method under controlled settings.
\subsection{Logistic regression}
In our experiments, we also investigate the logistic regression loss function with two \( l_2 \) regularizers for a click-prediction model, defined as:

\begin{align*}
f(x,y)&: = \frac{1}{n}\sum_{k=1}^n \log\left(1 + \exp\left(-\eta^k\langle\xi^k,(x,y)\rangle\right)\right)\\
    &+ \lambda_x \|x\|^2 + \lambda_y \|y\|^2.
\end{align*}

For this experiment, we used the ``a1a'' dataset from the LIBSVM collection~\citep{CC01a}. The datasets analyzed in this study are available in the LIBSVM repository. The smoothness constant for this dataset is estimated as \( L = 1.567 \). We set \( d_x = 100 \), \( d_y = 19 \), and \( \mu_x = 0.01 \). To explore condition numbers, we vary the parameter \( \mu_y \). Also, we consider the number of oracle calls to \( \nabla_x f(x, y) \) and \( \nabla_y f(x, y) \) for comparison across different methods.

\subsection{Results}
In our experiments, as illustrated in the plots, the new Block Accelerated Method (BAM) demonstrates superior performance in terms of the number of \(\nabla_x f(x, y)\) oracle calls for both objective functions across all tested condition numbers. This indicates that the new method is more efficient in terms of computational resources for these oracle calls. Additionally, all accelerated coordinate methods outperform the Nesterov Accelerated Method (NAG) by a significant margin, which serves to validate the theoretical bounds established for these methods. 



When considering \(\nabla_y f(x, y)\) oracle calls, the performance of BAM is approximately the same as that of other accelerated coordinate methods and the Nesterov Accelerated Method (NAG). This shows that BAM does not incur a performance penalty when evaluating \(\nabla_y f(x, y)\). In scenarios where oracle calls to \(\nabla_x f(x, y)\) are particularly costly, BAM can be particularly advantageous due to its reduced communication complexity. Furthermore, the method's ability to be generalized to distributed and federated settings further enhances its practical utility, suggesting that BAM has significant potential for practical applications.

\section{Discussion}
In this paper, we address a convex optimization problem with a min-min structure:

$$ \min_{x,y} f(x,y). $$

Under the assumption that \( f \) is \( L \)-smooth and \(\mu_x\)-strongly convex in \( x \), and \(\mu_y\)-strongly convex in \( y \), we propose a new algorithm, BAM, which requires \( \mathcal{O}\left(\sqrt{L/\mu_x}\log\frac{1}{\epsilon}\right) \) calculations of \( \nabla_x f \) and \( \mathcal{O}\left(\sqrt{L/\mu_y}\log\frac{1}{\epsilon}\right) \) calculations of \( \nabla_y f \) to achieve an \(\epsilon\)-accurate solution. Furthermore, we demonstrate the applicability of BAM to Federated Learning, showing its potential to reduce communication costs while maintaining high efficiency in decentralized settings. 

The approach proposed in this paper offers several possibilities for further generalizations. For instance, it can be adapted to mixed oracles, as introduced in \citep{gladin2021solving}, where instead of computing \( \nabla_y f \), only the function value \( f(x, y) \) is available. Another possible extension is increasing the number of blocks in the optimization problem (currently, we consider only two blocks, \( x \) and \( y \)) for more complex scenarios. Additionally, BAM can be combined with other techniques, such as composite sliding methods \citep{lan2016gradient, kovalev2022optimal}, which were mentioned at the outset of the introduction. These possible extensions present promising directions for future research and could lead to further improvements in efficiency and applicability across various domains. Furthermore, the proposed method opens up new avenues for exploring optimization in large-scale distributed systems, where the challenges of data heterogeneity and communication constraints are critical.












\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
The work of A. Gasnikov was supported by the Ministry of Economic Development of the RF (code 25-139-66879-1-0003).


\end{acknowledgements}

% References
\bibliography{example_paper}

\newpage

\onecolumn

\title{An Optimal Algorithm for Strongly Convex Min-Min Optimization\\(Supplementary Material)}
\maketitle





\appendix
\vspace{3mm}









\section{Proofs}
\begin{proof}[Proof of \Cref{lem:descent}]
	Using \cref{ass:L}, we get
	\begin{align*}
		f(\ol x^{k+1}, \ol y^{k+1})
		&\leq
		f(\ul x^{k}, \ol y^{k+1}) + \<\nabla_x f(\ul x^{k}, \ol y^{k+1}), \ol x^{k+1} - \ul x^k> + \frac{L_x}{2}\sqn{\ol x^{k+1} - \ul x^k}
		\\&=
		f(\ul x^{k}, \ol y^{k+1}) + \eta_x\alpha \left( \frac{\eta_x\alpha L_x}{2} - 1\right)\sqn{\nabla_x f(\ul x^{k}, \ol y^{k+1})}
		\\&\leq
		f(\ul x^{k}, \ol y^{k+1}) + \eta_x\alpha \left( \frac{1}{2} - 1\right)\sqn{\nabla_x f(\ul x^{k}, \ol y^{k+1})}
		\\&\leq
		f(\ul x^{k}, \ol y^{k+1}) - \frac{\eta_x\alpha}{2} \sqn{\nabla_x f(\ul x^{k}, \ol y^{k+1})}
	\end{align*}
\end{proof}

\begin{proof}[Proof of \Cref{thm:sliding}]


% \section*{Supplementary Material: Proof Details (Page 12–13)}

We start the derivation of the upper bound by considering the following term: $\eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1}$. Using the definitions of $\mathcal{R}_x^{k+1}$ and $\mathcal{R}_y^{k+1}$ in \Cref{thm:sliding}, we have \[ \eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1} = \eta_x^{-1}\Vert x^{k+1} - x^* \Vert^2+\eta_y^{-1}\Vert y^{k+1} - y^* \Vert^2. \] Let us consider the squared norm of the difference $\Vert x^{k+1} - x^* \Vert^2$:

\begin{align*}
\Vert x^{k+1} - x^*\Vert^2 &= \Vert x^{k+1} -x^k +x^k - x^*\Vert^2 \\ 
& = \Vert x^{k+1} - x^k\Vert^2 + 2\langle x^{k+1} -x^k, x^k - x^*\rangle + \Vert x^k - x^*\Vert^2\\
& = \Vert x^{k+1} - x^k\Vert^2 + 2\langle x^{k+1} -x^k, x^k - x^{k+1} + x^{k+1} - x^*\rangle + \Vert x^k - x^*\Vert^2\\
& = \Vert x^{k+1} - x^k\Vert^2 - 2\langle x^{k+1} -x^k, x^{k+1} - x^k \rangle+2\langle x^{k+1} -x^k, x^{k+1} - x^*\rangle + \Vert x^k - x^*\Vert^2\\
& = -\Vert x^{k+1} - x^k\Vert^2 + 2\langle x^{k+1} -x^k, x^{k+1} - x^*\rangle + \Vert x^k - x^*\Vert^2.
\end{align*}

Similarly, we have \begin{align*} \Vert y^{k+1} - y^*\Vert^2 = -\Vert y^{k+1} - y^k\Vert^2 + 2\langle y^{k+1} -y^k, y^{k+1} - y^*\rangle + \Vert y^k - y^*\Vert^2. \end{align*}

Combining these equations together we obtain

\begin{align*} \eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1} & =\eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k-\eta_x^{-1}\left\Vert x^{k+1}-x^k\right\Vert^2-\eta_y^{-1}\left\Vert y^{k+1}-y^k\right\Vert^2 \\
& +2 \eta_x^{-1}\left\langle x^{k+1}-x^k, x^{k+1}-x^*\right\rangle+2 \eta_y^{-1}\left\langle y^{k+1}-y^k, y^{k+1}-y^*\right\rangle. \end{align*}

Next, we recall that the update rules for new iterates are the following:  \begin{align*} & x^{k+1}=x^k+\alpha\left(\underline{x}^k-x^{k+1}\right)-\eta_x \nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right) \\ & y^{k+1}=y^k+\alpha\left(\bar{y}^{k+1}-y^{k+1}\right)-\eta_y \nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right). \end{align*} 

We can extract the difference between iterates:
\begin{align*} & x^{k+1} - x^k=\alpha\left(\underline{x}^k-x^{k+1}\right)-\eta_x \nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), \\ & y^{k+1} - y^k=\alpha\left(\bar{y}^{k+1}-y^{k+1}\right)-\eta_y \nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right). \end{align*}
 
 Now we can plug these identities into our previous main equation and obtain \begin{align*} \eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1} & =\eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k-\eta_x^{-1}\left\Vert x^{k+1}-x^k\right\Vert^2-\eta_y^{-1}\left\Vert y^{k+1}-y^k\right\Vert^2 \\
 & +2 \eta_x^{-1} \alpha\left\langle\underline{x}^k-x^{k+1}, x^{k+1}-x^*\right\rangle+2 \eta_y^{-1} \alpha\left\langle\bar{y}^{k+1}-y^{k+1}, y^{k+1}-y^*\right\rangle \\
 & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^{k+1}-x^*\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^{k+1}-y^*\right\rangle. \end{align*}
 
 Next, we need to use a standard algebraic identity: \[ 2 \langle a - b, b - c \rangle = \Vert a - c \Vert^2 - \Vert b - c\Vert^2 - \Vert b - a\Vert^2. \] We can quickly prove this statement:
 
  \begin{align*} \Vert a - c \Vert^2 - \Vert b - c\Vert^2 - \Vert b - a\Vert^2 & = \Vert a \Vert^2 - 2\langle a,c\rangle +\Vert c \Vert^2\\
  &- \left(\Vert b \Vert^2 - 2\langle b,c\rangle +\Vert c \Vert^2\right) - \left(\Vert b \Vert^2 - 2\langle a,b\rangle +\Vert a \Vert^2\right) \\ & = - 2\langle a,c\rangle + 2\langle b,c\rangle+2\langle a,b\rangle - 2\langle b,b\rangle \\ & = -2\langle a-b,c\rangle+2\langle a-b,b\rangle = 2\langle a-b, b-c\rangle. \end{align*}
  
  We apply this identity to the terms $2 \eta_x^{-1} \alpha\left\langle\underline{x}^k-x^{k+1}, x^{k+1}-x^*\right\rangle+2 \eta_y^{-1} \alpha\left\langle\bar{y}^{k+1}-y^{k+1}, y^{k+1}-y^*\right\rangle$ in our main equation and obtain the following: 
  \begin{align*} \eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1} &=\eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k-\eta_x^{-1}\left\Vert x^{k+1}-x^k\right\Vert^2-\eta_y^{-1}\left\Vert y^{k+1}-y^k\right\Vert ^2 \\
  & +\eta_x^{-1} \alpha\left(\left\Vert\underline{x}^k-x^*\right\Vert ^2-\left\Vert x^{k+1}-x^*\right\Vert^2-\left\Vert x^{k+1}-\underline{x}^k\right\Vert^2\right) \\
  & +\eta_y^{-1} \alpha\left(\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2-\left\Vert y^{k+1}-y^*\right\Vert^2-\left\Vert y^{k+1}-\bar{y}^{k+1}\right\Vert^2\right) \\
  & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^{k+1}-x^*\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^{k+1}-y^*\right\rangle.
  \end{align*}
  
  Since the squared norm of a vector is nonnegative, we have $A - \Vert b \Vert^2 \leq A$, so we can drop the terms $-\left\Vert x^{k+1}-\underline{x}^k\right\Vert^2$ and $-\left\Vert y^{k+1}-\bar{y}^{k+1}\right\Vert^2$: 
  
 \begin{align*} \eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1} &\leq \eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k-\eta_x^{-1}\left\Vert x^{k+1}-x^k\right\Vert^2-\eta_y^{-1}\left\Vert y^{k+1}-y^k\right\Vert^2 \\
 & +\eta_x^{-1} \alpha\left(\left\Vert \underline{x}^k-x^*\right\Vert^2-\left\Vert x^{k+1}-x^*\right\Vert^2\right) \\
 & +\eta_y^{-1} \alpha\left(\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2-\left\Vert y^{k+1}-y^*\right\Vert^2\right) \\
 & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^{k+1}-x^*\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^{k+1}-y^*\right\rangle . 
 \end{align*}

 Starting from the previous bound on $\eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1}$, let us expand the brackets: 
 
 \begin{align*} \eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1} &\leq \eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k-\eta_x^{-1}\left\Vert x^{k+1}-x^k\right\Vert^2-\eta_y^{-1}\left\Vert y^{k+1}-y^k\right\Vert^2 \\
 & +\eta_x^{-1} \alpha\left\Vert \underline{x}^k-x^*\right\Vert^2- \eta_x^{-1} \alpha \left\Vert x^{k+1}-x^*\right\Vert^2 \\
 & +\eta_y^{-1} \alpha\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2-\eta_y^{-1} \alpha\left\Vert y^{k+1}-y^*\right\Vert^2 \\
 & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^{k+1}-x^*\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^{k+1}-y^*\right\rangle . 
 \end{align*} 
 
 Next, we put $- \eta_x^{-1} \alpha \left\Vert x^{k+1}-x^*\right\Vert^2$ and $-\eta_y^{-1} \alpha\left\Vert y^{k+1}-y^*\right\Vert^2 $ to the left side, and this leads to 
 
 \begin{align*} (1+\alpha)\left(\eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1}\right) & \leq \eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k-\eta_x^{-1}\left\Vert x^{k+1}-x^k\right\Vert^2-\eta_y^{-1}\left\Vert y^{k+1}-y^k\right\Vert^2 \\
 & +\eta_x^{-1} \alpha\left\Vert \underline{x}^k-x^*\right\Vert^2+\eta_y^{-1} \alpha\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2 \\
 & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^{k+1}-x^*\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^{k+1}-y^*\right\rangle. \end{align*}
 
 Next, we add and subtract the vectors $x^k$ and $y^k$ in the inner products and use the following identity: $-2\langle a,b-c\rangle -2\langle a,c-d\rangle = -2\langle a,b-d\rangle$, so we have 
 
  \begin{align*} (1+\alpha)\left(\eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1}\right) &\leq\eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k-\eta_x^{-1}\left\Vert x^{k+1}-x^k\right\Vert^2-\eta_y^{-1}\left\Vert y^{k+1}-y^k\right\Vert^2 \\
  & +\eta_x^{-1} \alpha\left\Vert \underline{x}^k-x^*\right\Vert^2+\eta_y^{-1} \alpha\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2 \\
  & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^{k+1}-x^k\right\rangle-2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^k-x^*\right\rangle \\
  & -2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^{k+1}-y^k\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^k-y^*\right\rangle. \end{align*}
  
  Next, we need to apply Young's inequality for inner products (also known as the Peter–Paul inequality): $$<a,b> \leq \frac{\Vert a \Vert^2 }{2c_1} + \frac{\Vert b \Vert^2c_1 }{2}.$$
  
  If we use $a^\prime = -a$, then we also have
$$-<a,b> \leq \frac{\Vert a \Vert^2 }{2c_1} + \frac{\Vert b \Vert^2c_1 }{2}.$$ We apply this inequality and obtain

 \begin{align*} (1+\alpha)\left(\eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1}\right) & \leq \eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k-\eta_x^{-1}\left\Vert x^{k+1}-x^k\right\Vert^2-\eta_y^{-1}\left\Vert y^{k+1}-y^k\right\Vert^2 \\
 & +\eta_x^{-1} \alpha\left\Vert \underline{x}^k-x^*\right\Vert^2+\eta_y^{-1} \alpha\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2 \\
 & +\eta_x^{-1}\left\Vert x^{k+1}-x^k\right\Vert^2+\eta_x\left\Vert \nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2-2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^k-x^*\right\rangle \\
 & +\eta_y^{-1}\left\Vert y^{k+1}-y^k\right\Vert^2+\eta_y\left\Vert \nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^k-y^*\right\rangle.
 \end{align*}
 
 Note that $\eta_x^{-1}\left\Vert x^{k+1} - x^k \right\Vert^2$ and $\eta_y^{-1}\left\Vert y^{k+1} - y^k \right\Vert^2$ cancel out, since we also have the terms $-\eta_x^{-1}\left\Vert x^{k+1} - x^k \right\Vert^2$ and $-\eta_y^{-1}\left\Vert y^{k+1} - y^k \right\Vert^2$, so we have 
 
  \begin{align*} (1+\alpha)\left(\eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1}\right) & \leq \eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k+\eta_x^{-1} \alpha\left\Vert\underline{x}^k-x^*\right\Vert^2+\eta_y^{-1} \alpha\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2 \\
  & +\eta_x\left\Vert \nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2+\eta_y\left\Vert \nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2 \\
  & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^k-x^*\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^k-y^*\right\rangle. \end{align*}
  
  Next, we need to add and subtract vectors $\underline{x}^k$ and $\bar{y}^{k+1}$ in inner products, so we have 
  
   \begin{align*} (1+\alpha)\left(\eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1}\right) &\leq\eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k+\eta_x^{-1} \alpha\left\Vert \underline{x}^k-x^*\right\Vert^2+\eta_y^{-1} \alpha\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2 \\
   & +\eta_x\left\Vert \nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2+\eta_y\left\Vert \nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2 \\
   & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), \underline{x}^k-x^*\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), \bar{y}^{k+1}-y^*\right\rangle \\
   & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^k-\underline{x}^k\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^k-\bar{y}^{k+1}\right\rangle. \end{align*}
   
   Now we are ready to apply strong convexity Assumption~\ref{ass:mu} specifically for terms $ -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), \underline{x}^k-x^*\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), \bar{y}^{k+1}-y^*\right\rangle $. This allows us to obtain the following inequality:
   
    \begin{align*} (1+\alpha)\left(\eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1}\right) & \leq \eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k+\eta_x^{-1} \alpha\left\Vert \underline{x}^k-x^*\right\Vert^2+\eta_y^{-1} \alpha\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2 \\
    & +\eta_x\left\Vert \nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2+\eta_y\left\Vert \nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2 \\ 
    & +2\left(f\left(x^*, y^*\right)-f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right)-\mu_x\left\Vert \underline{x}^k-x^*\right\Vert^2-\mu_y\left\Vert \bar{y}^{k+1}-y^*\right\Vert^2 \\
    & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^k-\underline{x}^k\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^k-\bar{y}^{k+1}\right\rangle. \end{align*} 
   
   After rearranging terms we obtain 
   
 \begin{align*} (1+\alpha)\left(\eta_x^{-1} \mathcal{R}_x^{k+1}+\eta_y^{-1} \mathcal{R}_y^{k+1}\right) & \leq \eta_x^{-1} \mathcal{R}_x^k+\eta_y^{-1} \mathcal{R}_y^k+\left(\eta_x^{-1} \alpha-\mu_x\right)\left\Vert \underline{x}^k-x^* \right\Vert^2+\left(\eta_y^{-1} \alpha-\mu_y\right)\left\Vert\bar{y}^{k+1}-y^*\right\Vert^2 \\
 & +\eta_x\left\Vert\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2+\eta_y\left\Vert\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right\Vert^2+2\left(f\left(x^*, y^*\right)-f\left(\underline{x}^k, \bar{y}^{k+1}\right)\right) \\
 & -2\left\langle\nabla_x f\left(\underline{x}^k, \bar{y}^{k+1}\right), x^k-\underline{x}^k\right\rangle-2\left\langle\nabla_y f\left(\underline{x}^k, \bar{y}^{k+1}\right), y^k-\bar{y}^{k+1}\right\rangle. \end{align*}  

 Next, we use the update rules: 			 $\ul{x}^k = \alpha x^k + (1-\alpha)\ol{x}^k$ and $\ul{y}^k = \alpha y^k + (1-\alpha)\ol{y}^k$. From these lines we derive $x^k = \frac{\underline{x}^k - (1-\alpha)\bar{x}^k}{\alpha}$ and $y^k = \frac{\underline{y}^k - (1-\alpha)\bar{y}^k}{\alpha}$ and obtain 
       \begin{align*}
			(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)	&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) 
		\\&
		+\frac{2(1-\alpha)}{\alpha}\<\nabla_x f(\ul x^k, \ol y^{k+1}),\ol x^k - \ul x^k>
		\\&
		-\<\nabla_y f(\ul x^k, \ol y^{k+1}),\frac{2}{\alpha}(\ul y^k - (1-\alpha)\ol y^k)- 2\ol y^{k+1}>
		\\&=
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) 
		\\&
		+\frac{2(1-\alpha)}{\alpha}\<\nabla_x f(\ul x^k, \ol y^{k+1}),\ol x^k - \ul x^k>
		\\&
		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k + (1-\alpha)\ol y^k -(1-\alpha)\ol y^{k+1}>
		\\&=
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) 
		\\&
		+\frac{2(1-\alpha)}{\alpha}\left(\<\nabla_x f(\ul x^k, \ol y^{k+1}),\ol x^k - \ul x^k> + \<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^k -\ol y^{k+1}>\right)
		\\&
		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>.
	\end{align*}
Next, we use the convexity of $f(x,y)$ to bound the term $\left(\<\nabla_x f(\ul x^k, \ol y^{k+1}),\ol x^k - \ul x^k> + \<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^k -\ol y^{k+1}>\right)$ from above by $f(\ol x^k, \ol y^k) - f(\ul x^k, \ol y^{k+1})$:
\begin{align*}
		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
		&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) 
		\\&
		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(\ul x^k, \ol y^{k+1})\right)
		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>.
\end{align*}
After reshuffling of terms we obtain
\begin{align*}
	(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)	\\&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		\\&
		+2 f(x^*, y^*)
		+\frac{2(1-\alpha)}{\alpha}f(\ol x^k, \ol y^k)
		-\frac{2}{\alpha}f(\ul x^k, \ol y^{k+1})
		\\&
		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>.
	\end{align*}

    	Using \Cref{lem:descent}, we obtain
	\begin{align*}
		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
		&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		\\&
		+2 f(x^*, y^*)
		+\frac{2(1-\alpha)}{\alpha}f(\ol x^k, \ol y^k)
		\\&
		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) + \frac{\eta_x\alpha}{2} \sqn{\nabla_x f(\ul x^{k}, \ol y^{k+1})}\right)
		\\&
		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>.
		\end{align*}
Since $2 f(x^*, y^*) = \frac{2}{\alpha}f(x^*, y^*) - \frac{2(1-\alpha)}{\alpha}f(x^*, y^*)$ and the two terms $\eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}$ cancel, we have

        \begin{align*}
		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
		&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right)
		\\&
		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>.
        \end{align*}
Next, we multiply and divide the last inner product by $\eta_y$ (that is, we use $\frac{\eta_y}{\eta_y}=1$) and obtain
\begin{align*}
				(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right)
		\\&
		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		+2\eta_y\<\nabla_y f(\ul x^k, \ol y^{k+1}),(\eta_y\alpha)^{-1}(\ol y^{k+1}-\ul y^k)>.
\end{align*}
Using the fact that $2\left\langle a,b \right\rangle = \|a+b\|^2 - \|a\|^2 - \|b\|^2$ we get
\begin{align*}
				(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right)
		\\&
		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		+\eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1}) + (\eta_y\alpha)^{-1}(\ol y^{k+1}-\ul y^k)}
		\\&
		-\eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
		-\eta_y^{-1}\alpha^{-2}\sqn{\ol y^{k+1} - \ul y^k}.
        \end{align*}
        After rearranging terms we get 
        \begin{align*}
				(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right)
		\\&
		+\eta_y\left(\sqn{\nabla_y f(\ul x^k, \ol y^{k+1}) + (\eta_y\alpha)^{-1}(\ol y^{k+1}-\ul y^k)} - (\eta_y\alpha)^{-2}\sqn{\ol y^{k+1} - \ul y^k}\right).
	\end{align*}

Using inequality \eqref{eq:ms}, we get
	\begin{align*}
		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
		&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
		\\&
		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right).
	\end{align*}
	Using the choice of parameters $\eta_x,\eta_y,\alpha$, we get
	\begin{align*}
		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
		&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
		\\&
		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right).
	\end{align*}
	After rearranging, we get
	\begin{align*}
		\Psi^{k+1}
		&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
		\\&\leq
		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
		+\frac{2(1+\alpha)^{-1}}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
		\\&=
		(1+\alpha)^{-1} \Psi^k.
	\end{align*}

\end{proof}


\begin{proof}[Proof of \Cref{cor:inner}]
	Using inequality~\eqref{eq:inner} and \eqref{eq:T}, we get
	\begin{align*}
		\norm{\nabla A^k(\ol y^{k+1})}
		&\leq
		\frac{(\eta_y\alpha)^{-1}}{2}\norm{\ul y^k- \argmin_{y \in \sY} A^k(y)}
		\\&\leq
		\frac{(\eta_y\alpha)^{-1}}{2}\norm{\ol y^{k+1} - \ul y^k}
		+\frac{(\eta_y\alpha)^{-1}}{2}\norm{\ol y^{k+1} - \argmin_{y \in \sY} A^k(y)}.
	\end{align*}
	The function $A^k(y)$ is $(\eta_y\alpha)^{-1}$-strongly convex, which implies
	\begin{equation}
		(\eta_y\alpha)^{-1}\norm{\ol y^{k+1} - \argmin_{y \in \sY} A^k(y)} \leq \norm{\nabla A^k(\ol y^{k+1}) - \nabla A^k(\argmin_{y \in \sY} A^k(y))} = \norm{\nabla A^k(\ol y^{k+1})}.
	\end{equation}
	Hence, 
	\begin{align*}
		\norm{\nabla A^k(\ol y^{k+1})}
		&\leq
		\frac{(\eta_y\alpha)^{-1}}{2}\norm{\ol y^{k+1} - \ul y^k}
		+\frac{1}{2}\norm{\nabla A^k(\ol y^{k+1})}.
	\end{align*}
	Rearranging concludes the proof.
\end{proof}



























% ====================================
% \begin{proof}[Proof of \Cref{thm:sliding}]
% 	\begin{align*}
% 		\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}
% 		&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		-\eta_x^{-1}\sqn{x^{k+1} - x^k}
% 		-\eta_y^{-1}\sqn{y^{k+1} - y^k}
% 		\\&
% 		+2\eta_x^{-1}\<x^{k+1} - x^k,x^{k+1} - x^*>
% 		+2\eta_y^{-1}\<y^{k+1} - y^k,y^{k+1} - y^*>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		-\eta_x^{-1}\sqn{x^{k+1} - x^k}
% 		-\eta_y^{-1}\sqn{y^{k+1} - y^k}
% 		\\&
% 		+2\eta_x^{-1}\alpha\<\ul x_k - x^{k+1},x^{k+1} - x^*>
% 		+2\eta_y^{-1}\<\ol{y}^{k+1} - y^{k+1},y^{k+1} - y^*>
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^{k+1} - x^*>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^{k+1} - y^*>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		-\eta_x^{-1}\sqn{x^{k+1} - x^k}
% 		-\eta_y^{-1}\sqn{y^{k+1} - y^k}
% 		\\&
% 		+\eta_x^{-1}\alpha\left(\sqn{\ul x^k - x^*} - \sqn{x^{k+1} - x^*} - \sqn{x^{k+1} - \ul x^k}\right)
% 		\\&
% 		+\eta_y^{-1}\alpha\left(\sqn{\ol y^{k+1} - y^*} - \sqn{y^{k+1} - y^*} - \sqn{y^{k+1} - \ol y^{k+1}}\right)
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^{k+1} - x^*>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^{k+1} - y^*>
% 		\\&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		-\eta_x^{-1}\sqn{x^{k+1} - x^k}
% 		-\eta_y^{-1}\sqn{y^{k+1} - y^k}
% 		\\&
% 		+\eta_x^{-1}\alpha\left(\sqn{\ul x^k - x^*} - \sqn{x^{k+1} - x^*} \right)
% 		\\&
% 		+\eta_y^{-1}\alpha\left(\sqn{\ol y^{k+1} - y^*} - \sqn{y^{k+1} - y^*} \right)
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^{k+1} - x^*>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^{k+1} - y^*>.
% 	\end{align*}
% 	This implies
% 	\begin{align*}
% 		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
% 		&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		-\eta_x^{-1}\sqn{x^{k+1} - x^k}
% 		-\eta_y^{-1}\sqn{y^{k+1} - y^k}
% 		\\&
% 		+\eta_x^{-1}\alpha\sqn{\ul x^k - x^*} 
% 		+\eta_y^{-1}\alpha\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^{k+1} - x^*>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^{k+1} - y^*>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		-\eta_x^{-1}\sqn{x^{k+1} - x^k}
% 		-\eta_y^{-1}\sqn{y^{k+1} - y^k}
% 		\\&
% 		+\eta_x^{-1}\alpha\sqn{\ul x^k - x^*} 
% 		+\eta_y^{-1}\alpha\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^{k+1} - x^k>
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^k - x^*>
% 		\\&
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^{k+1} - y^k>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^k - y^*>.
% 	\end{align*}
% 	Using Young's inequality, we get
% 	\begin{align*}
% 		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
% 		&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		-\eta_x^{-1}\sqn{x^{k+1} - x^k}
% 		-\eta_y^{-1}\sqn{y^{k+1} - y^k}
% 		\\&
% 		+\eta_x^{-1}\alpha\sqn{\ul x^k - x^*} 
% 		+\eta_y^{-1}\alpha\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+\eta_x^{-1}\sqn{x^{k+1} - x^k} + \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^k - x^*>
% 		\\&
% 		+\eta_y^{-1}\sqn{y^{k+1} - y^k} + \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^k - y^*>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+\eta_x^{-1}\alpha\sqn{\ul x^k - x^*} 
% 		+\eta_y^{-1}\alpha\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^k - x^*>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^k - y^*>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+\eta_x^{-1}\alpha\sqn{\ul x^k - x^*} 
% 		+\eta_y^{-1}\alpha\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),\ul x^k - x^*>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1} - y^*>
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^k - \ul x^k>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^k - \ol y^{k+1}>
% 	\end{align*}
% 	Using \Cref{ass:mu}, we get
% 	\begin{align*}
% 		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
% 		&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+\eta_x^{-1}\alpha\sqn{\ul x^k - x^*} 
% 		+\eta_y^{-1}\alpha\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		\\&
% 		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) - \mu_x \sqn{\ul x^k - x^*} - \mu_y \sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^k - \ul x^k>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^k - \ol y^{k+1}>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) 
% 		\\&
% 		-2\<\nabla_x f(\ul x^k, \ol y^{k+1}),x^k - \ul x^k>
% 		-2\<\nabla_y f(\ul x^k, \ol y^{k+1}),y^k - \ol y^{k+1}>
%         \end{align*}



        
%         \begin{align*}
% 			(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)	&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) 
% 		\\&
% 		+\frac{2(1-\alpha)}{\alpha}\<\nabla_x f(\ul x^k, \ol y^{k+1}),\ol x^k - \ul x^k>
% 		\\&
% 		-\<\nabla_y f(\ul x^k, \ol y^{k+1}),\frac{2}{\alpha}(\ul y^k - (1-\alpha)\ol y^k)- \ol y^{k+1}>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) 
% 		\\&
% 		+\frac{2(1-\alpha)}{\alpha}\<\nabla_x f(\ul x^k, \ol y^{k+1}),\ol x^k - \ul x^k>
% 		\\&
% 		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k + (1-\alpha)\ol y^k -(1-\alpha)\ol y^{k+1}>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) 
% 		\\&
% 		+\frac{2(1-\alpha)}{\alpha}\left(\<\nabla_x f(\ul x^k, \ol y^{k+1}),\ol x^k - \ul x^k> + \<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^k -\ol y^{k+1}>\right)
% 		\\&
% 		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>.
% 	\end{align*}
% 	Using convexity of $f(x,y)$, we get
% 	\begin{align*}
% 		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
% 		&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		+2\left(f(x^*, y^*) - f(\ul x^k, \ol y^{k+1})\right) 
% 		\\&
% 		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(\ul x^k, \ol y^{k+1})\right)
% 		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>.
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		\\&
% 		+2 f(x^*, y^*)
% 		+\frac{2(1-\alpha)}{\alpha}f(\ol x^k, \ol y^k)
% 		-\frac{2}{\alpha}f(\ul x^k, \ol y^{k+1})
% 		\\&
% 		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>.
% 	\end{align*}
% 	Using \Cref{lem:descent}, we get
% 	\begin{align*}
% 		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
% 		&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+ \eta_x\sqn{\nabla_x f(\ul x^k, \ol y^{k+1})}
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		\\&
% 		+2 f(x^*, y^*)
% 		+\frac{2(1-\alpha)}{\alpha}f(\ol x^k, \ol y^k)
% 		\\&
% 		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) + \frac{\eta_x\alpha}{2} \sqn{\nabla_x f(\ul x^{k}, \ol y^{k+1})}\right)
% 		\\&
% 		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
% 		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right)
% 		\\&
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		+\frac{2}{\alpha}\<\nabla_y f(\ul x^k, \ol y^{k+1}),\ol y^{k+1}-\ul y^k>
%         \end{align*}
%         \begin{align*}
% 				(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
% 		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right)
% 		\\&
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		+2\eta_y\<\nabla_y f(\ul x^k, \ol y^{k+1}),(\eta_y\alpha)^{-1}(\ol y^{k+1}-\ul y^k)>
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
% 		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right)
% 		\\&
% 		+ \eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		+\eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1}) + (\eta_y\alpha)^{-1}(\ol y^{k+1}-\ul y^k)}
% 		\\&
% 		-\eta_y\sqn{\nabla_y f(\ul x^k, \ol y^{k+1})}
% 		-\eta_y^{-1}\alpha^{-2}\sqn{\ol y^{k+1} - \ul y^k}
% 		\\&=
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
% 		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right)
% 		\\&
% 		+\eta_y\left(\sqn{\nabla_y f(\ul x^k, \ol y^{k+1}) + (\eta_y\alpha)^{-1}(\ol y^{k+1}-\ul y^k)} - (\eta_y\alpha)^{-2}\sqn{\ol y^{k+1} - \ul y^k}\right).
% 	\end{align*}
% 	Using inequality \eqref{eq:ms}, we get
% 	\begin{align*}
% 		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
% 		&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+(\eta_x^{-1}\alpha - \mu_x)\sqn{\ul x^k - x^*} 
% 		+(\eta_y^{-1}\alpha - \mu_y)\sqn{\ol y^{k+1} - y^*}
% 		\\&
% 		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
% 		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right).
% 	\end{align*}
% 	Using the choice of parameters $\eta_x,\eta_y,\alpha$, we get
% 	\begin{align*}
% 		(1+\alpha)\left(\eta_x^{-1}\cR_x^{k+1} + \eta_y^{-1}\cR_y^{k+1}\right)
% 		&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
% 		\\&
% 		-\frac{2}{\alpha}\left( f(\ol x^{k+1}, \ol y^{k+1}) - f(x^*,y^*)\right).
% 	\end{align*}
% 	After rearranging, we get
% 	\begin{align*}
% 		\Psi^{k+1}
% 		&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+\frac{2(1-\alpha)}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
% 		\\&\leq
% 		\eta_x^{-1}\cR_x^{k} + \eta_y^{-1}\cR_y^{k}
% 		+\frac{2(1+\alpha)^{-1}}{\alpha}\left(f(\ol x^k, \ol y^k) - f(x^*,y^*)\right)
% 		\\&=
% 		(1+\alpha)^{-1} \Psi^k.
% 	\end{align*}

% \end{proof}


% \begin{proof}[Proof of \Cref{cor:inner}]
% 	Using inequality~\eqref{eq:inner} and \eqref{eq:T}, we get
% 	\begin{align*}
% 		\norm{\nabla A^k(\ol y^{k+1})}
% 		&\leq
% 		\frac{(\eta_y\alpha)^{-1}}{2}\norm{\ul y^k- \argmin_{y \in \sY} A^k(y)}
% 		\\&\leq
% 		\frac{(\eta_y\alpha)^{-1}}{2}\norm{\ol y^{k+1} - \ul y^k}
% 		+\frac{(\eta_y\alpha)^{-1}}{2}\norm{\ol y^{k+1} - \argmin_{y \in \sY} A^k(y)}.
% 	\end{align*}
% 	Function $A^k(y)$ is $(\eta_y\alpha)^{-1}$-strongly convex which implies
% 	\begin{equation}
% 		(\eta_y\alpha)^{-1}\norm{\ol y^{k+1} - \argmin_{y \in \sY} A^k(y)} \leq \norm{\nabla A^k(\ol y^{k+1}) - \nabla A^k(\argmin_{y \in \sY} A^k(y))} = \norm{\nabla A^k(\ol y^{k+1})}.
% 	\end{equation}
% 	Hence, 
% 	\begin{align*}
% 		\norm{\nabla A^k(\ol y^{k+1})}
% 		&\leq
% 		\frac{(\eta_y\alpha)^{-1}}{2}\norm{\ol y^{k+1} - \ul y^k}
% 		+\frac{1}{2}\norm{\nabla A^k(\ol y^{k+1})}.
% 	\end{align*}
% 	Rearranging concludes the proof.
% \end{proof}


\end{document}
