\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
                        

%% Choose your variant of English; be consistent
\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{ELF: Federated Langevin Algorithms with Primal, Dual and Bidirectional Compression}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Avetik Karagulyan}
\author[2]{Peter Richtárik}
% Add affiliations after the authors
\affil[1]{%
    CNRS, CentraleSupélec, Université Paris-Saclay, Laboratoire des Signaux et Systèmes, France
}
\affil[2]{%
    King Abdullah University of Science and Technology, Saudi Arabia
}



\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{subfigure}
\usepackage[titletoc,toc,page]{appendix}
\usepackage{minitoc}


\usepackage{algorithm}
\usepackage{algorithmic}
  

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{graphicx}
\usepackage{booktabs} % for professional tables


\usepackage{hyperref}

%%% more pleasant colors
\usepackage{xcolor,colortbl}
\definecolor{midnightblue}{HTML}{0059b3}
\definecolor{darkmidnightblue}{HTML}{154c84}
\definecolor{noonblue}{HTML}{e5eef7}
\definecolor{chromered}{HTML}{f14233}
\definecolor{darkgreen}{HTML}{0e6029}

\hypersetup{ 
colorlinks=true,
linkcolor = darkmidnightblue,
citecolor = darkgreen,
urlcolor=midnightblue           % color of external links
}



% Attempt to make hyperref and algorithmic work together better:
% \newcommand{\theHalgorithm}{\arabic{algorithm}}


% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
% \usepackage[capitalize,noabbrev]{cleveref}
% \usepackage[noabbrev]{cleveref}
\usepackage{cleveref}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{fact}[theorem]{Fact}
\newcounter{ban}
\newtheorem{assumption}[ban]{Assumption}

\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\usepackage[colorinlistoftodos,bordercolor=orange,backgroundcolor=orange!20,linecolor=orange,textsize=scriptsize]{todonotes}

\newcommand{\prnote}[1]{\todo[color=cyan!]{#1}}
\newcommand{\aknote}[1]{\todo[color=olivedrab!,textcolor=white]{#1}}

\newcommand{\peter}[1]{\todo[inline]{\textbf{Peter:} #1}}
%\newcommand{\peter}[1]{\todo[inline]{{\textbf{Peter:} \emph{#1}}}}
\newcommand{\Avetik}[1]{\todo[inline]{\textbf{Avetik:} #1}}

\newcommand{\rmd}{{\mathrm d}}
\newcommand{\rmP}{{\mathrm P}}
\newcommand{\rmJ}{{\mathrm J}}
\newcommand{\rmT}{{\mathrm T}}
\newcommand{\rmQ}{{\mathrm Q}}
\newcommand{\cF}{{\mathcal F}}
\newcommand{\cD}{{\mathcal D}}
\newcommand{\mcG}{{\mathscr G}}
\newcommand{\cB}{{\mathcal B}}
\newcommand{\cO}{{\mathcal O}}
\newcommand{\cG}{{\mathcal G}}
\newcommand{\cN}{{\mathcal N}}
\newcommand{\cL}{{\mathcal L}}
\newcommand{\cP}{{\mathcal P}}
\newcommand{\cH}{{\mathcal H}}
\newcommand{\cS}{{\mathcal S}}   
\newcommand{\bG}{{\mathbf G}} 
\newcommand{\bB}{{\mathbf B}} 
\newcommand{\bT}{{\mathbf T}} 
\newcommand{\sfX}{{\mathsf X}} 
\newcommand{\esp}{{\epsilon}} 
\newcommand{\Norm}[1]{\left\|#1\right\|_{H}}
\newcommand{\sqN}[1]{\Norm{#1}^2}
%\newcommand{\norm}[1]{\left\|#1\right\|}
\newcommand{\btheta}{\boldsymbol\theta}
\newcommand{\bvartheta}{\boldsymbol\vartheta}


\newcommand{\dom}{{dom}} 
\newcommand{\KSD}{\mathop{\mathrm{KSD}}\nolimits}
\newcommand{\Unif}{\mathop{\mathrm{Unif}}\nolimits}
\newcommand{\HS}{\mathop{\mathrm{HS}}\nolimits}
\newcommand{\op}{\mathop{\mathrm{op}}\nolimits}
\newcommand{\tr}{\mathop{\mathrm{tr}}\nolimits}
\newcommand{\var}{\mathop{\mathrm{var}}\nolimits}
\newcommand{\ps}[1]{\langle #1 \rangle}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\newcommand{\KL}[1]{H_{\pi}\left( #1\right)}
\newcommand{\FS}[1]{J_{\pi}\left( #1\right)}
\newcommand{\Exp}[1]{\mathbb{E}\left[ #1 \right]}
\newcommand{\dif}[1]{\frac{\rmd #1}{\rmd t}}


\newcommand{\Stein}{\rm Stein}

\newcommand{\bs}{\boldsymbol}
\newcommand{\bsB}{\boldsymbol B}
\newcommand{\bH}{\mathbf H}
\newcommand{\bsD}{\boldsymbol D}
\newcommand{\bfD}{\mathbf D}
\newcommand{\bfC}{\mathbf C}
\newcommand{\bfZ}{\mathbf Z}
\newcommand{\bfE}{\mathbf E}
\newcommand{\bfP}{\mathbf P}
\newcommand{\bfQ}{\mathbf Q}
\newcommand{\bfN}{\mathbf N}
\newcommand{\bfSigma}{\mathbf\Sigma}
\newcommand{\bfH}{\mathbf H}
\newcommand{\bfA}{\mathbf A}
\newcommand{\bfR}{\mathbf R}
\newcommand{\bfS}{\mathbf S}
\newcommand{\bfM}{\mathbf M}
\newcommand{\bL}{\boldsymbol L}
\newcommand{\bA}{\boldsymbol A}
\newcommand{\bC}{\boldsymbol C}
\newcommand{\bZ}{\boldsymbol Z}
\newcommand{\bW}{\boldsymbol W}
\newcommand{\bV}{\boldsymbol V}
\newcommand{\bh}{\boldsymbol h}
\newcommand{\bx}{\boldsymbol x}
\newcommand{\bu}{\boldsymbol u}
\newcommand{\bv}{\boldsymbol v}
\newcommand{\bg}{\boldsymbol g}
\newcommand{\bX}{\boldsymbol X}
\newcommand{\bxi}{\boldsymbol\xi}
\newcommand{\bmu}{\boldsymbol\mu}
\newcommand{\NN}{\mathbb N}
\newcommand{\RR}{\mathbb R}
\newcommand{\EE}{\mathbb E}
\newcommand{\BB}{\mathbb B}
\newcommand{\CC}{\mathbb C}
\newcommand{\LL}{\mathbb L}
\newcommand{\Cov}{\textbf{Cov}}
\newcommand{\Var}{\textbf{Var}}




\newcommand\bfI{\mathbf I}




\newcommand{\bomega}{\text{\boldmath$\omega$}}
\newcommand{\cA}{\mathcal{A}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cZ}{\mathcal{Z}}

\newcommand{\norm}[1]{\left\| #1 \right\|}
\newcommand{\normsq}[1]{\left\| #1 \right\|^2}
\newcommand{\inner}[2]{\left< #1 , #2 \right>}

\newcommand{\brr}[1]{\left( #1 \right)}   % brackets round
\newcommand{\brs}[1]{\left[ #1 \right]}  % brackets square
\newcommand{\brc}[1]{\left\{ #1 \right\}} % brackets curly

\newcommand{\algname}[1]{{\sf #1}}

  \begin{document}
\maketitle



  \begin{abstract}
    Federated sampling algorithms have recently gained great popularity in the community of machine learning and statistics. 
    This paper proposes a new family of federated sampling algorithms called Error Feedback Langevin algorithms (ELF). 
    In particular, we analyze the combinations of  EF21 and EF21-P with the federated Langevin Monte-Carlo. 
    We propose three algorithms, P-ELF, D-ELF, and B-ELF, that use  primal, dual, and bidirectional compressors. 
    We analyze the proposed methods under Log-Sobolev inequality and provide non-asymptotic convergence guarantees. 
    Simple experimental results support our theoretical findings.


  \end{abstract}
  % \addtocontents{toc}{\protect\setcounter{tocdepth}{0}}
\section{Introduction}

  Sampling from high-dimensional distributions holds immense significance in modern statistics and machine learning. 
  This problem is particularly relevant in Bayesian inference \citep{robert2007bayesian}, where sampling from high-dimensional posterior distributions is the bottleneck. 
  This work will focus specifically on sampling from posteriors that arise in Bayesian federated learning \citep{kassab2020federated,pmlr-vono22a-fed-lmc,liu2022wireless}. 

  Federated learning is a machine learning framework that assumes data is distributed across different devices/clients, with a central server coordinating them. 
  This scenario commonly arises in mobile applications, where each device possesses data and maintains a (limited) internet connection with the server \citep{konevcny2016federated,mcmahan2017communication-efficient}. 
  Consequently, the communication complexity becomes the computational bottleneck in most cases. 
  The objective is to train a global model by performing local updates while minimizing the information communicated.

More formally, we want to sample from a target distribution $\pi$ that is defined on the Euclidean space $\mathbb{R}^d$ and is absolutely continuous with respect to the Lebesgue measure. 
For convenience, we will use $\pi$ to refer to both the target distribution and its density function, given by:
\begin{equation}\label{eq:density}
\pi(x) \propto \exp(-F(x)),
\end{equation}
where $F:\mathbb{R}^d \rightarrow \mathbb{R}$ is called the potential function. 
In particular, when solving a Bayesian inference problem, $F$ corresponds to the negative log-posterior (up to an additive constant).
In the federated setting, the potential function is assumed to be sum-decomposable, with each component stored on one of the clients or nodes/devices:
\begin{equation*}
F(x) = \frac{1}{n} \sum_{i=1}^n F_i(x),
\end{equation*}
where $n$ is the number of nodes and $F_i(x)$ represents the potential function of the $i$-th node. Each node only has access to its respective score, the gradient $\nabla F_i(x)$.

Building upon this framework, we propose three sampling algorithms that combine Langevin Monte Carlo (LMC) with well-known federated optimization techniques called EF21 \citep{richtarik2021ef21} and EF21-P \citep{sasha_kaja_EF-21P}. The algorithms are as follows:
\begin{itemize}
%\vspace{-.2cm}
\item D-ELF: LMC with dual compression (\Cref{sec:uplink});
%\vspace{-.1cm}
\item P-ELF: LMC with primal compression (\Cref{sec:downlink});
%\vspace{-.1cm}
\item B-ELF: LMC with bidirectional compression (\Cref{sec:bidir}).
%\vspace{-.2cm}
\end{itemize}

The first algorithm, D-ELF, focuses on client-to-server (uplink) compression to reduce communication complexity. Early papers on federated learning, such as \citet{konevcny2016federated}, assumed that uplink communication is more costly than server-to-client (downlink) communication. 
However, more recent reports\footnote{\url{https://www.speedtest.net/global-index}} indicate that the difference between uploading and downloading speeds is negligible \citep{philippenko2020bidirectional}. 
As a result, downlink compression becomes equally important.
The second algorithm, P-ELF, adopts the EF21 scheme for the primal space, applying compression to the server-to-client (downlink) communication \citep{sasha_kaja_EF-21P}. This approach leverages compression in the direction opposite to the traditional uplink compression.
The third algorithm, B-ELF, combines uplink and downlink compression, hence the term ``bidirectional.'' 
In the frequentist setting, bidirectional federated learning has been explored by several authors \citep{liu2020double,philippenko2020bidirectional,sasha_kaja_EF-21P}. 
However, this setting has not yet been extensively developed and studied for sampling problems. 
In this work, we analyze the first federated sampling algorithm incorporating bidirectional compression.
  

  \subsection{Langevin sampling}

  Langevin Monte-Carlo is one of the most common methods of sampling. 
  It is based on discretizing a stochastic differential equation (SDE) called Langevin diffusion (LD). 
  The latter is formulated as follows:
  \begin{equation*}
    \rmd L_t = - \nabla F(L_t)\rmd t + \sqrt{2}\rmd B_t,
      % L_t = L_0  - \int_0^t \nabla F(L_s)\rmd s + \sqrt{2t} Z,
  \end{equation*}
  where $B_t$ is the Brownian motion and $F$ is the potential function from \eqref{eq:density}. 
  The critical property of this SDE is that it has a solution and is ergodic under mild conditions. 
  Moreover, the target $\pi$ is its invariant distribution \citep{bhattacharya1978criteria}.
  Let us now define by $\rho_t$ the density of $L_t$. 
  Then, the evolution of $\rho_t$ is characterized by the Fokker-Planck equation corresponding to LD 
  \citep{pavliotis2014stochastic,risken1996fokker}:
  \begin{equation*}
    \frac{\partial \rho_t(x)}{\partial t} =  \nabla\cdot \brr{\nabla F(x) \rho_t(x)} + \Delta \rho_t(x).
  \end{equation*}
  Using the chain rule in the Fokker-Planck equation, one can verify that $\pi$ is indeed the stationary distribution for the Langevin diffusion.

  Langevin Monte-Carlo (LMC) is the Euler-Maruyama discretization of the Langevin diffusion \citep{parisi1981correlation}. 
  That is,
  \begin{equation}\label{eq:lmc}
    x_{k+1} = x_k - \gamma \nabla F(x_k) + \sqrt{2\gamma} Z_k,
  \end{equation}
  where $(Z_k)_k$ is a sequence of i.i.d. standard Gaussians on $\RR^d$ that are independent of previous iterations. 
  If the gradient of the potential (score) function is Lipschitz continuous, and the target satisfies the Log-Sobolev inequality, then the distribution of the $K$-th iterate converges to $\pi$ 
  \citep{vempala2019rapid}. 
  See \Cref{sec:related_work} for more context on the LMC.

  \subsection{EF21 and EF21-P}

  The Error Feedback algorithm first appeared in a heuristic manner in the paper by \citet{seide20141}.
  It was proposed as a stabilization mechanism for supervised learning using contractive compressors.
  Later, \citet{alistarh2018convergence,stich2018sparsified} analyzed the method theoretically. 
  Nevertheless, the initial EF has issues. Namely, it does not generalize to the distributed setting, which is crucial to federated learning, and the convergence analysis requires unrealistic assumptions, such as bounds on the gradient norm.
  See also Section 2 of \citet{horvath2020better} for more details on the shortcomings of the Error Feedback method.
  The EF21 (Error Feedback 21) algorithm, proposed by \citet{richtarik2021ef21}, modifies the original EF.
  The method proposes Markov compressors and uses them to compress gradient differences before communicating them to the server.
  It solves the above issues, and in particular, it applies to the distributed setting.
  The method is state of the art in theory and practice amongst error feedback mechanisms \citep{fatkhullin2021ef21}.

  Interestingly, theoretical guarantees on EF21 are rather conservative. Compared with other methods, it does not gain in terms of communication complexity. However, simple experiments show that EF21 beats all the other FL methods, hinting that the worst-case analysis is not informative in this case. 
  We refer the reader to \Cref{sec:uplink} for the exact definition and mathematical details of the EF21.

  EF21-P is a primal error-feedback method largely inspired by EF21. 
  The method is essentially the analog of EF21 on the primal space.
  Contrary to the dominating approach in federated learning \citep{konevcny2016federated,stich2018sparsified,mishchenko2019distributed,richtarik2021ef21,fatkhullin2021ef21}, it performs compression on iterates of the algorithm rather than their gradients.
  Hence, it reduces the complexity of downlink communication. 
  In general, efficient server-to-client compression may play a key role when the model is extremely large \citep{dean2012large,brown2020language}. 
  Furthermore, according to \citet{sasha_kaja_EF-21P}, EF21-P can also be viewed as an iteration perturbation method. 
  These methods are used in various settings in machine learning, including generalization \citep{orvieto2022anticorrelated} and smoothing \citep{duchi2012randomized}. 
  For the complete definition of the method, see \Cref{sec:downlink}.

  
  \subsection{Related work}\label{sec:related_work}

  \paragraph{Langevin Monte-Carlo}
  In their seminal work, \citet{roberts1996exponential} investigated the convergence properties of the Langevin Monte-Carlo (LMC) algorithm and found that a bias occurs when discretizing the continuous SDE. 
  This bias leads to the stationary distribution of the generated homogeneous Markov chain differing from the target distribution $\pi$. 
  To address this issue, \citet{roberts1996exponential} proposed a Metropolis-Hastings adjustment step at each iteration of the LMC, resulting in the Metropolis Adjusted Langevin Algorithm (MALA) \citep{roberts1998optimal,roberts2002langevin,xifara2014langevin,dwivedi2018log}.
   The bias of LMC depends on the discretization step size $\gamma$, and  \citet{dalalyan2017theoretical} proved a bound on this error. 
   Later, several researchers studied different properties of LMC \citep{durmus2017nonasymptotic,cheng2018sharp,cheng2018convergence,dalalyan2019user,durmus2019high,vempala2019rapid}.

   \paragraph{Connecting LMC and SGD}
  The LMC algorithm can be viewed as an instance of stochastic gradient descent (SGD) with independent Gaussian noise, as seen in \eqref{eq:lmc}. This similarity has been exploited in various settings for sampling problems \citep{raginsky2017non,chatterji2018theory,wibisono2019proximal,salim2019stochastic,karagulyan2020penalized}. Specifically, federated Langevin algorithms combine LMC with existing optimization mechanisms, such as LMC+FedAvg \citep{mcmahan2017communication-efficient,deng2021convergence,plassier2022federated}, LMC+MARINA \citep{gorbunov2021marina,marina-langevin}, and LMC+QSGD \citep{alistarh2017qsgd,pmlr-vono22a-fed-lmc}. Our work extends this line of research by introducing error-feedback mechanisms EF21 and EF21-P to the classic LMC algorithm in the federated setting.

  \paragraph{Relaxing strong convexity}
  Strong convexity of the potential function plays a crucial role in the analysis of LMC. 
  Non-convex optimization has long been a central topic in the domain, while sampling from non-strongly log-concave distributions is less studied. 
  Previous studies on LMC convergence focused on strong convexity outside a ball \citep{cheng2018sharp}, penalization of the convex potential \citep{dalalyan2019bounding,karagulyan2020penalized}, and non-convex regimes \citep{mangoubi2019nonconvex}. 
  However, these results either do not cover the general non-convex case or require conditions that scale poorly with the dimension. 
  A more efficient approach is based on isoperimetric inequalities, as they imply rapid mixing of continuous stochastic processes \citep{villani2008optimal}. 
  \citet{vempala2019rapid} proved LMC convergence under Log-Sobolev inequality, and \citet{marina-langevin} extended this scheme to LMC with stochastic gradient estimators in the context of federated Langevin sampling. Our work simplifies their proof and adapts it to our setting.

  \paragraph{Bayesian approach to FL}
  Most FL algorithms currently focus on minimizing the training loss. However, they fail to provide reliable uncertainty quantification mechanisms, which are necessary for safety-critical applications according to some studies \citep{coglianese2016regulating,fatima2017survey}. 
  To address this issue, various authors \citep{welling2011bayesian,yurochkin2019bayesian,chenfedbe,izmailov2021bayesian,wilson2022evaluating,vedadi2024federated} have proposed using the federated version of Bayesian inference. 
  For example, the aim can be to calculate the regions with the highest posterior density of the predictive distribution. An important particular case is the Bayesian Neural Networks. Using Bayesian inference in neural networks can lead to better predictions, more accurate uncertainty measurements, and a systematic way of comparing different models. It can also support active learning, continual learning, and decision-making when there is uncertainty. The Bayesian deep learning community has developed several practical methods that use the Bayesian approach \citep{gal2016dropout}, which have been successful in various fields, including astrophysics \citep{cranmer2021bayesian}, diagnosing diabetic retinopathy \citep{filos2019systematic}, predicting click-through rates in advertising \citep{liu2017pbodl}, and analyzing fluid dynamics \citep{geneva2020modeling}. 


\cite{cao2023bayesian} give a broad overview on Bayesian federated learning, which is the Bayesian approach to federated learning, that targets issues such as data heterogeneity and client variability.


  \paragraph{Federated sampling algorithms}

All the competitor papers study federated sampling \emph{without compressing the iterate information}, unlike our algorithms P-ELF and B-ELF. 
See Sections \ref{sec:downlink} and \ref{sec:bidir} for formal definitions. 

A standard reference of federated Langevin sampling is the QLSD algorithm by \cite{pmlr-vono22a-fed-lmc}. 
However, they require strong log-concavity of the target distribution. 
Our analysis, instead, relies on the log-Sobolev inequality, which is a strictly more general assumption.

Another notable method is the federated averaging Langevin dynamics (FALD) \citep{deng2021convergence}. 
Federated averaging uses local methods as an alternative to compression to reduce communication complexity. 
As in the case of QLSD, the analysis is performed only for log-concave targets.

The paper by \cite{liang2024bayesian} studies federated averaging with Hamiltonian Monte-Carlo.  
The iteration of the HMC algorithm requires solving a differential equation, and thus is more computationally expensive when compared to first-order Langevin Monte-Carlo based methods. 
Moreover, the convergence analysis in the paper assumes a significantly stronger regularity condition, specifically, second-order smoothness, which combined with the stronger oracle of HMC might lead to faster convergence. 




\subsection{Structure of the paper}

This paper is organized as follows. 
\Cref{sec:framework} describes the mathematical framework of the problem, the notation, definitions, and assumptions.
In Sections \ref{sec:uplink} and \ref{sec:downlink}, we present the uplink and downlink compressed Langevin algorithms, that is, D-ELF and P-ELF, respectively. 
In \Cref{sec:bidir}, we introduce our bidirectional federated Langevin algorithm: B-ELF.
The main convergence results are presented in  \Cref{sec:theorems}.
The analysis of all three methods is influenced by  \cite{vempala2019rapid} and \cite{marina-langevin}.  
We simplify and adapt their proofs to our method; see \Cref{sec:proof-scheme} and \Cref{sec:proof_ef21-lmc}. 
\Cref{sec:experiments} provides simple experiments, comparing the proposed algorithm with LMC in the federated setting. 
We conclude the main part of the paper with \Cref{sec:conclusion}. 

\section{Problem setup}\label{sec:framework}

We denote by $\RR^d$ the $d$-dimensional Euclidean space endowed with its usual scalar product and 
${\ell}_2$-norm, denoted by 
$\langle \cdot, \cdot\rangle$ and $\norm{\cdot}$, respectively.
The gradient and the Hessian of a function $H$ evaluated at the point $x \in \RR^d$ are denoted by $\nabla H (x)$ and $\nabla^2 H(x)$, respectively.
As mentioned, we will repeatedly use the same notation for probability distributions and their corresponding densities.
For the asymptotic complexity of the algorithms, we will use the $\cO$ and $\tilde{\cO}$ notations. We say that $f(t) = \cO(g(t))$ when $t \rightarrow +\infty$, if $f(t) \leq Mg(t)$, for some $M > 0$ and when $t$ is large enough. 
Similarly, $f(t) = \tilde{\cO}(g(t))$, if $f(t) = \cO(g(t)\log^{c}(t))$ for some constant $c \geq 0$, that is, $\tilde{\cO}$ hides polylogarithmic factors. For two measures $\mu$ and $\nu$, we use $\nu \ll \mu$ to denote that $\nu$ is absolutely continuous with respect to $\mu$.

\subsection{Mathematical framework}

The vast majority of optimization and sampling literature relies on the $L$-smoothness assumption. 
\begin{assumption}[$L$-smoothness]
\label{def:smoothness}
  The potential function is $L$-smooth. That is, for every $x,y \in \RR^d$ 
  \begin{equation*}
    F(y) \leq F(x) + \inner{\nabla F(x)}{y-x} + \frac{L\normsq{x-y}}{2}. 
  \end{equation*}
\end{assumption}

EF21 and EF21-P rely on contractive compressors to reduce the communication complexity. 
\begin{definition}[Contractive compressor]
  \label{def:compression}
  A stochastic mapping $\mathcal{Q}: \mathbb{R}^{d} \rightarrow \mathbb{R}^{d}$ is a contractive compression operator with a coefficient $\alpha \in (0,1]$ if for any $x \in \mathbb{R}^{d}$,
  \begin{equation*}
    % \label{eq:comp-var}
   \Exp{\|\mathcal{Q}(x)-x\|^{2}} 
    \leq (1 - \alpha)\|x\|^{2}.
  \end{equation*}
  We denote it shortly as $\cQ \in \BB(\alpha)$.
\end{definition}
Here, we notice that we do not require unbiasedness. 
In many federated learning algorithms, unbiased compressors with bounded variance are used \citep[see, e.g.,][]{konevcny2016federated,alistarh2017qsgd,mishchenko2019distributed,gorbunov2021marina}. 
Unbiased compressors are defined as (possibly stochastic) mappings such that $\EE[\cQ(x)] = x$ and $\EE[\normsq{\cQ(x) - x}] \leq \omega \normsq{x}$. Then, simple computation shows that $\frac{1}{\omega+1}\cQ$ is a $\frac{1}{\omega+1}$-contractive compressor.
However, the class of contractive compressors is strictly larger.
Indeed, let us look at the Top-$\tau$ compressor \citep{alistarh2017qsgd}.
This compressor returns only the $\tau$ coordinates with the largest absolute values of the input vector.
For example, if $x = (-4,3,10,-1,2)^{\top}$, then we have $\cQ_{\text{Top-}2}(x) = (-4,0,10,0,0)^{\top}$.
It is obvious that Top-$\tau$ cannot be represented with unbiased compressors, as it is deterministic.
This concludes the argument.

Our analysis relies on the interpretation of sampling as an optimization problem over the space of measures. In order to reformulate our problem, let us first recall the definition of the 
 Kullback-Leibler divergence.
\begin{definition}[Kullback-Leibler divergence]
  The Kullback-Leibler divergence between two probability measures $\nu$ and $\pi$ is defined as
   \begin{equation*}
    \KL{\nu} = 
    \begin{cases}
       \int_{\RR^d}  \log\left(\frac{\nu(x)}{\pi(x)}\right)\nu(x)\rmd x, \text{ if } \nu \ll \pi;\\
       +\infty, \text{ otherwise}.
    \end{cases}
  \end{equation*}
\end{definition}

We aim to construct approximate samples from $\pi$ with $\varepsilon$ accuracy. 
That is, we want to sample from some other distribution $\nu$ such that $\KL{\nu} < \varepsilon$.
Alternatively, it means that we want to minimize the functional:
\begin{equation*}
  \min_{\nu \in \cP(\RR^d)} \KL{\nu}.
\end{equation*}
Indeed, the minimum of this functional is equal to zero and  is attained only when $\nu = \pi$.
Recall now the classical problem of optimization, that is, minimizing a function $H: \RR^d \rightarrow \RR$. \citet{polyak1963gradient} and \citet{lojasiewicz1963topological} independently proposed an inequality that is weaker than strong convexity but nevertheless implies linear convergence of gradient descent.
It is known under the joint name of Polyak-{\L}ojasiewicz inequality:
\begin{equation*}
  H(x) - \min_{y \in \RR^d} H(y) \leq  \frac{1}{2\mu} \normsq{\nabla H(x)},
\end{equation*}



\begin{algorithm}[t]
  \caption{D-ELF}\label{alg:ef21_langevin}
  \begin{algorithmic}[1]
    \STATE {\bfseries Input:} Initialization $x_0 \sim\rho_0 $, $g^i_0 = \nabla F_i(x_0)$, $g_0 = \nabla F(x_0)$, step-size $\gamma$, iterations $K$
    \FOR {$k=0,1,2,\ldots,K-1$}
    
    \STATE \underline{The server:} 
    \STATE \hspace{0cm} draws $Z_{k}\sim\mathcal{N}(0,I_{d})$;
    \STATE \hspace{0cm} $\circ$ $x_{k+1}=x_k-\gamma g_{k} {{+ \sqrt{2\gamma}Z_{k}}}$;
    \STATE  \hspace{0cm}  broadcasts $x_{k+1}$;
    \STATE \underline{The devices in parallel:}  
    \STATE \hspace{0cm} $\circ$ $g^i_{k+1} = g^i_k + \cQ^{\rm D}(\nabla F_i(x_{k+1}) - g^i_k)$;
    \STATE \hspace{0cm}  broadcast  $\cQ^{\rm D}(\nabla F_i(x_{k+1}) - g^i_k)$;  
    \STATE \underline{The server:} 
    \STATE   $\circ$ $ g_{k+1}  =  g_k + \frac{1}{n} \sum_{i=1}^{n} 
    \cQ^{\rm D}( \nabla F_i(x_{k+1}) - g^i_{k})$.
    \ENDFOR 
    \STATE {\bfseries Return:} $x_K$
  \end{algorithmic}
\end{algorithm}
 
assuming the objective has a minimum.
See \citep{karimi2016linear,khaled2020better} for more details on the PŁ inequality, as well as its comparison with other similar conditions for non-convex optimization.
In the problem of sampling, the objective functional is defined on the space of measures $\cP(\RR^d)$.
One can define the usual notions of differentiability and convexity on this space using the Wasserstein distance \citep{ambrosio2008gradient}.
Then, the Langevin Monte-Carlo algorithm becomes a first order minimization method for the {\sf KL} divergence \citep{wibisono2018sampling}.
Furthermore, Fisher information takes the role of the square norm of the gradient.
\begin{definition}[Fisher information]
 The Fisher information of probability measures $\nu$ and $\pi$ is denoted by $\FS{\nu}$ and it is defined as below:
 % %\vspace{-.2cm}
  \begin{equation*}
    \FS{\nu}:=
   \begin{cases}
       \int_{\RR^d}\normsq{\nabla \log\brr{\frac{\nu}{\pi}}}\nu(x)\rmd x, \text{ if } \nu \ll \pi;\\
       +\infty, \text{ otherwise}.
    \end{cases}
  \end{equation*}
\end{definition}
% %\vspace{-.52cm}
Since the minimum of our functional is equal to zero, the Log-Sobolev inequality (LSI) becomes the analog of PŁ inequality.
\begin{assumption}[Log-Sobolev inequality]
    \label{def:LSI}
  The target $\pi$ satisfies the Log-Sobolev inequality (LSI) with parameter $\mu$. That is, for every probability measure $\nu \in \cP(\RR^d)$ we have
  % %\vspace{-.5cm}
  \begin{equation*}
    \KL{\nu}\leq \frac{1}{2\mu}\FS{\nu}.
  \end{equation*} 
  % %\vspace{-1.5cm}
\end{assumption}

\citet{bakry1985diffusions} have shown that strongly log-concave distributions satisfy LSI.
Furthermore, from Holley-Stroock's theorem we know that sufficiently small perturbations of strongly log-concave distributions still satisfy LSI \citep{holley1986logarithmic}. 
The latter distributions can be non log-concave, which means that we deal with a strictly larger class of probability measures using LSI.  

Analyzing the sampling problems as an optimization problem on the Wasserstein space has been strongly influenced by the seminal paper of \citet{jordan1998variational}. It has later been developed in subsequent work; see e.g. \citep{wibisono2018sampling,durmus2018analysis}. 
We use Log-Sobolev inequality to derive bounds on the convergence error in {\sf KL} divergence. 



\section{The ELF algorithms}

  In this section, we present two federated Langevin Monte-Carlo algorithms, combining EF21 and EF21-P with LMC. 
  We replace the gradient term $\nabla F(x_k)$ at each iteration with the gradient estimator $g_k$ from the corresponding error feedback method, and add independent Gaussian noise. 
  Details can be found in \Cref{alg:ef21_langevin} and \Cref{alg:p-elf}. 
  The pseudocode distinguishes between optimization and sampling methods with a wave symbol.

  \subsection{Dual compression: D-ELF}\label{sec:uplink}

  The gradient estimator $g_k$ of the dual method is defined as the average of the vectors $g^i_k$,
  where each $g^i_k$ is computed on the $i$-th node and estimates the gradients $\nabla F_i(x_k)$. 
  The key component of this estimator is the contractive compression operator $\cQ^{\rm D} \in \BB(\alpha^{\rm D}) $. 
  At the zeroth iteration, $g_0 = \nabla F(x_0)$. 
  Then at iteration $k$, the server computes the new iterate $x_{k+1}=x_k-\gamma g_{k} {+ \sqrt{2\gamma}Z_{k}}$ and broadcasts it in parallel to all the nodes. 
  Each node  updates $g^i_k$ with the formula:
  \begin{equation*}
    g^i_{k+1} = g^i_k + {\cQ^{\rm D}(\nabla F_i (x_{k+1}) - g^i_k)},
  \end{equation*}
  and broadcasts the compressed term to the server.
  The server aggregates the received information and computes the estimator of $\nabla F (x_{k+1})$:
  \begin{equation*}
    g_{k+1} = g_k + \frac{1}{n}\sum_{i=1}^{n}\cQ^{\rm D}(\nabla F_i (x_{k+1}) - g^i_k).
  \end{equation*}
  For the pseudocode of the D-ELF, please refer to \Cref{alg:ef21_langevin}. 



\subsection{Primal compression: P-ELF}\label{sec:downlink}


   
\begin{algorithm}[t]
  \caption{P-ELF}\label{alg:p-elf}
  \begin{algorithmic}[1]
    \STATE {\bfseries Input:} Starting point $x_0 = w_0 \sim\rho_0 $, step-size $\gamma$, number of iterations $K$
    \FOR {$k=0,1,2,\cdots,K-1$}
    \STATE \underline{The server:} 
    \STATE \hspace{0cm} draws  $Z_{k}\sim\mathcal{N}(0,I_{d})$;
    \STATE \hspace{0cm} $\circ$ $\nabla F(w_{k})  = \frac{1}{n}\sum_{i=1}^{n} \nabla F_i(w_{k})$;
    \STATE \hspace{0cm} $\circ$ $x_{k+1}=x_k-\gamma \nabla F(w_{k}) {{ + \sqrt{2\gamma}Z_{k}}}$;
    \STATE  \hspace{0cm} $\circ$ $w_{k+1} = w_k + \cQ^{\rm P}(x_{k+1}-w_k)$;
    \STATE  \hspace{0cm}  broadcasts in parallel  $\cQ^{\rm P}(x_{k+1}-w_k)$.
    \STATE \underline{The devices in parallel:}  
    \STATE \hspace{0cm} $\circ$ $w_{k+1} = w_k + \cQ^{\rm P}(x_{k+1}-w_k)$;
    \STATE \hspace{0cm} $\circ$  $\nabla F_i(w_{k+1})$;
    \STATE \hspace{0cm} broadcast  $\nabla F_i(w_{k+1})$;
    \ENDFOR
    \STATE {\bfseries Return:} $x_K$
  \end{algorithmic}
\end{algorithm}

The construction of the P-ELF algorithm is similar to the D-ELF. 
In particular, we take the EF21-P algorithm by \citet{sasha_kaja_EF-21P} and add only the independent Gaussian term. See \Cref{alg:p-elf} for the complete definition. 
To better understand the comparison  of  the D-ELF and the P-ELF let us look at the simple one-node setting of the latter:
{
\begin{equation}
  \begin{cases}
    w_0 := \cQ^{\rm P}(x_0)\\
    w_{k+1} = w_k + \cQ^{\rm P}(x_{k+1} - w_k)\\
    x_{k+1} = x_k - \gamma \nabla F(w_{k}) + \sqrt{2\gamma} Z_k.\\
  \end{cases}
\end{equation}
Here, $x_0 \sim \rho_0$ is a random starting point, $\cQ^{\rm P} \in \BB(\alpha_{\rm P})$, and $(Z_k)_k$ is a sequence of i.i.d. standard Gaussians on $\RR^d$.
If we remove the additive Gaussian noise $Z_k$, then we recover the EF21-P algorithm, which is known to converge to the minimum of the potential function $F$
\citep{sasha_kaja_EF-21P}. 
The auxiliary sequence $w_k$ is meant to estimate the iterate $x_k$. 
We then use its gradient as the minimizing direction. 
The important difference with the EF21 is that we apply the compressor $\cQ^{\rm P}$ on the term $x_{k+1} - w_k$, instead of the gradient and its estimator. Hence the letter ``P'' (for primal) in the name of the algorithm. 




\subsection{Bidirectional compression: B-ELF}\label{sec:bidir}

This section focuses on the bidirectional setting. 
We propose the B-ELF algorithm. 
The algorithm uses EF21 for the uplink and EF21-P for the downlink compression. 
We use the same notation as for the previous methods and the 
 details are presented in \Cref{alg:bidir_langevin}.




\begin{algorithm}[h!]
  \caption{B-ELF}\label{alg:bidir_langevin}
  \begin{algorithmic}[1]
    \STATE {\bfseries Input:} Starting point $x_0 = w_0 \sim\rho_0 $, step-size $\gamma$, number of iterations $K$,
    $g_0 = \nabla F(x_0)$, $g^i_0 = \nabla F_i(x_0)$.
    \FOR {$k=0,1,2,\cdots,K-1$}
    \STATE \underline{The server:} 
    \STATE  draws a Gaussian vector $Z_{k}\sim\mathcal{N}(0,I_{d})$;
    \STATE  computes $x_{k+1}=x_k- \gamma g_{k} + \sqrt{2\gamma}Z_{k}$;
    \STATE   computes $v_k := \cQ^{\rm P}(x_{k+1}-w_k)$;
    \STATE  computes $w_{k+1} = w_k + v_k$;
    \STATE    broadcasts $v_k$ in parallel to the devices;
    
    \STATE \underline{The device $i$ (in parallel for all $i=1,\ldots,n$):} 
    \STATE  computes $w_{k+1} = w_k + v_k$;
    \STATE   computes $h^i_{k+1} = \cQ^{\rm D}(\nabla F_i(w_{k+1}) - g^i_{k})$;
    \STATE computes $g^i_{k+1} = g^i_k + h^i_{k+1} $;
    \STATE  broadcasts  $h^i_{k+1}$;
    \STATE \underline{The server:} 
    \STATE  computes $g_{k+1} =  g_k + \frac{1}{n}\sum_{i=1}^{n} h^i_{k+1} $;
    \ENDFOR
    \STATE {\bfseries Return:} $x_K$
  \end{algorithmic}
\end{algorithm}





\section{Convergence of the methods}\label{sec:theorems}


\subsection{A unified analysis of D-ELF and P-ELF}\label{sec:delf-analysis}

  The key component of the analysis of both methods is defining a proper Lyapunov-type function.
  For the D-ELF algorithm we define by $\bG^{\rm D}_k$ the average squared estimation error of the vectors $g^i_k$:
  \begin{equation}\label{eq:gk-ef21}
    \bG^{\rm D}_k := \frac{1}{n} \sum_{i=1}^{n} \EE\brs{\norm{g^i_k - \nabla F_i(x_k)}^2}.
  \end{equation}
  As we will see later in \Cref{sec:proof-scheme}, this quantity arises in the proof of the convergence rates. 
  An important property of the sequence $\bG^{\rm D}_k$ is the following recurrent inequality.
  \begin{proposition}\label{prop:uplink}
    Let $x_k$ be the iterates of the D-ELF, $g^i_k$ be the EF21 estimators and $\bG^{\rm D}_k$ be defined as \eqref{eq:gk-ef21}.
    Then the following recurrent  inequality is true:
    \begin{equation}\label{eq:prop-dual}
      \bG^{\rm D}_{k+1} \leq (1 - p)\bG^{\rm D}_k + (1 - p)\beta_{\rm D}  \EE\brs{\norm{x_{k + 1} - x_k}^2},
    \end{equation}
    where  $p := 1 - (1-\alpha_{\rm D})(1+s_{\rm D}) >0$
    \begin{equation*}
      \begin{aligned}
        \bar{L} := \frac{1}{n}\sum_{i=1}^{n} L_i^2 \quad \text{and} \quad
        \beta_{\rm D} := \frac{1+s_{\rm D}^{-1}}{1 + s_{\rm D}}\bar{L},
      \end{aligned}
    \end{equation*} 
    for some $s_{\rm D}>0$.    
  \end{proposition}
  The Lyapunov term associated to the P-ELF is a simple upper bound on $\bG^{\rm D}$. 
  We denote it by $\bG^{\rm P}_k$ and define with the formula below:
    \begin{equation}\label{eq:tk-def}
    \begin{aligned}
      \bG^{\rm P}_k := \bar{L}\Exp{\norm{w_k - x_k}^2},    
      \hspace{.2cm} \text{where} \hspace{.2cm}
      \bar{L} := \frac{1}{n}\sum_{i=1}^{n} L_i^2.
    \end{aligned}
    \end{equation}
  Indeed, $\bG^{\rm D}_k \leq \bG^{\rm P}_k$ due to $L_i$ smoothness of each component function $F_i$.
  See \eqref{eq:gd-gp} in \Cref{sec:proof_ef21-lmc} for the proof.
  The following proposition proves a recurrent inequality similar to \eqref{eq:prop-dual}.
  \begin{proposition}\label{prop:downlink}
    Let $x_k$ and $w_k$ be defined as in P-ELF and $\bG^{\rm P}$ be its Lyapunov term.
    Then the following recurrent  inequality is true:
    \begin{equation*}
      \bG^{\rm P}_{k+1} 
      \leq (1 - p)\bG^{\rm P}_k + (1 - p)\beta_{\rm P} \EE\brs{\norm{x_{k + 1} - x_k}^2},
    \end{equation*}
    where  $p := 1 - (1-\alpha_{\rm P})(1+s_{\rm P}) >0,$ and 
        $\beta_{\rm P} := \frac{1+s_{\rm P}^{-1}}{1 + s_{\rm P}}\bar{L},$
    for some $s_{\rm P}>0$.    
  \end{proposition}
The next theorem gives a unified bound for both D-ELF and P-ELF. 
For the sake of space we use a general notation M-ELF, where $\text{ M} \in \brc{\text{D,P}}$. 
This means, for example, that the M-ELF refers to the D-ELF when $\text{M} = \text{D}$.
\begin{theorem}
  \label{thm:ef21-lmc}
  Let $x_k$ be the iterates  of the M-ELF algorithm, where $\text{M} \in \brc{\text{D,P}}$.   
  We denote by  $\rho_k := \cL({x_k})$ for every $k \in \NN$. 
  Under Assumptions \ref{def:smoothness} and \ref{def:LSI}, if 
  \begin{equation*}
    0<\gamma\leq \min \left\{\frac{1}{14}\sqrt{\frac{p}{ (1+\beta_{\rm M})}},\frac{p}{6\mu}, \frac{1}{2\sqrt{2}L}\right\},
  \end{equation*}
  then the following is true for the {\sf KL} error of the M-ELF algorithm:
  \begin{equation*} 
     \KL{\rho_{K}}\leq e^{-\mu K \gamma}\Psi+\frac{\tau}{\mu},
  \end{equation*}
  where $p := 1 - (1-\alpha_{\rm M})(1+s_{\rm M}) > 0$, $\tau =\left(2{L^2}+C(1-p) \beta_{\rm M}\right)\left(16 \gamma^2d+4d\gamma\right)$,
   { 
   \begin{align*}
      \Psi &=\KL{\rho_0}+\frac{1-e^{-\mu \gamma}}{\mu}C\bG^{\rm M}_{0},\\
    C &=\frac{8{L^2}\gamma^2+2}{e^{-\mu \gamma}-(1-p)\left(4\gamma^2\beta_{\rm M}+1\right)}.
    \end{align*}}
\end{theorem}
We refer the reader to \Cref{sec:proof_ef21-lmc} for the proof of the theorem.
The right-hand side consists of two terms. 
The first term corresponds to the convergence error, while the second term is the bias that comes from the discretization.
To make the error small, one would first need to choose $\gamma$ small enough so that $\tau/\mu < \varepsilon$. 
Then, the number of iterations is chosen to be of order $\tilde{\cO}(\nicefrac{1}{\mu\gamma})$.
See \Cref{sec:discussion} for more on the complexity of D-ELF and P-ELF.

These bounds can also be extended to other probability distance metrics, such as {\sf TV} and $W_2$.
The relation of {\sf TV} and {\sf KL} is established with Pinsker's inequality:
  ${\sf TV}(\nu_1,\nu_2) \leq \sqrt{\frac{1}{2} {H_{\nu_2}\left( \nu_1 \right)}}.$
Thus, the convergence in {\sf KL} divergence implies convergence in {\sf TV}.
A similar result holds for the Wasserstein-2 distance.
% \begin{definition}[Wasserstein-2]
%     \label{def:W2}
%    Let  $\nu_1,\nu_2 \in \mathcal{P}_{2} (\RR^d)$. That is, their second moments are finite. 
%   The Wasserstein-$2$ distance between two probability measures is defined as 
%   \begin{equation*}
%     W_2(\nu_1,\nu_2) := \inf _{\eta \in {\Gamma}(\nu_1, \nu_2)} \left[\int\|x-y\|^{2} 
%     \eta(\rmd x, \rmd y)\right]^{1/2},
%   \end{equation*}
%   $\Gamma(\nu_1,\nu_2)$ is the set of all 
%   joint distributions defined on $\RR^d \times \RR^d$ having $\nu_1$ and $\nu_2$ as its marginals (also known as couplings).
% \end{definition}
It is known that LSI implies Talagrand's inequality \citep{otto2000generalization}. The latter bounds the $W_2$ distance with {\sf KL} divergence: $W_2(\nu,\pi)\leq \sqrt{\frac{2\KL{\nu}}{\mu}}$ for all $ \nu \in\cP_2(\RR^d)$.
Again, from the convergence in {\sf KL} we can deduce convergence in $W_2$.
\begin{table}[t]
  \caption{In this table we compare error-feedback methods in optimization and sampling. 
  The rates are computed in the case when $\alpha_{\rm D} = \alpha_{\rm P} = \alpha$. }
  \label{table1}
  % \vskip 0.15in
  \begin{center}
    \begin{scriptsize}
    \begin{sc}
      \begin{tabular}{lccl}
      \toprule
        Method &  \hspace{-.3cm}  Assumption  & \hspace{-.3cm} Complexity &\hspace{-.3cm}  Reference \\ 
        \midrule
        GD    & \hspace{-.3cm} $\mu$-s.c. & \hspace{-.3cm}  $\tilde{\cO}\brr{\frac{dL}{\mu\varepsilon}}$ & \hspace{-.3cm} \citet{nesterov2013introductory}\\ %\vspace{.15cm}
        EF21 &  \hspace{-.3cm} $\mu$-s.c.& \hspace{-.3cm}  $\tilde{\cO}\brr{\frac{L}{\alpha\mu\varepsilon}}$ &\hspace{-.3cm}  \citet{richtarik2021ef21}\\%\vspace{.15cm}
        EF21-P  &  \hspace{-.3cm} $\mu$-s.c.& \hspace{-.3cm}  $\tilde{\cO}\brr{\frac{L}{\alpha\mu\varepsilon}}$ & \hspace{-.3cm} \citet{sasha_kaja_EF-21P} \\
        \midrule
        %\vspace{.15cm}
        LMC    & \hspace{-.3cm}   $\mu$-LSI  & \hspace{-.3cm}    $\tilde{\cO}\brr{\frac{L^2 d}{\mu^2\varepsilon}}$ &\hspace{-.3cm}  \citet{vempala2019rapid}     \\%\vspace{.15cm}
        D-ELF    &  \hspace{-.3cm}   $\mu$-LSI  &  \hspace{-.3cm} $\tilde{\cO}\brr{\frac{\bar{L}d }{\alpha^2\mu^2\varepsilon}}$  &\hspace{-.3cm}  Corollary~\ref{corr:delf-conv}\\%\vspace{.15cm}
        P-ELF    &  \hspace{-.3cm}   $\mu$-LSI  &  \hspace{-.3cm} $\tilde{\cO}\brr{\frac{\bar{L}d }{\alpha^2\mu^2\varepsilon}}$  &\hspace{-.3cm}  Corollary~\ref{corr:delf-conv}\\%\vspace{.15cm}
        B-ELF     &  \hspace{-.3cm}  $\mu$-LSI  & \hspace{-.3cm}   $\tilde{\cO}\brr{\frac{\bar{L} d}{\alpha^4\mu^2\varepsilon}}$& \hspace{-.3cm} Corollary~\ref{corr:belf-conv}  \\
      \bottomrule
      \end{tabular}
    \end{sc}
    \end{scriptsize}
  \end{center}
  \vskip -0.1in
\end{table}


\subsection{Convergence analysis of the B-ELF}\label{sec:belf-analysis}

The Lyapunov term for the B-ELF algorithm is the same as for the D-ELF, that is 
 $\bG_k^{\rm D}$.
However, the recurrent identity of Proposition~\ref{prop:uplink} is not valid in this case.
Instead, another bound is true which  includes the term $\bG^{\rm P}_k$.  
The latter arises because of the downlink compression.
We present \textit{informally} the new recurrent inequality.
We refer the reader to Proposition~\ref{prop:bidir} in the Appendix for the complete statement.
\begin{proposition}[Informal]\label{prop:bidir_incom}
  If $x_{k}$ are the iterates of \Cref{alg:bidir_langevin}, $\bG^{\rm D}_k$ and $\bG^{\rm P}_k$ are defined as in \eqref{eq:gk-ef21} and \eqref{eq:tk-def}, then  
  \begin{equation*}
      \bG^{\rm D}_{k+1} \leq \lambda_1 \bG^{\rm D}_k + \lambda_2 \EE\brs{\norm{ x_{k}-  x_{k+1}}^2} + \lambda_3 \bG^{\rm P}_k,
  \end{equation*}
  where  $\lambda_1,\lambda_2$ and $\lambda_3$ are positive numbers. 
\end{proposition}


\begin{theorem}\label{thm:bidir}
  Let $x_k$ be the iterates  of the B-ELF algorithm.   
  We denote by  $\rho_k := \cL({x_k})$ for every $k \in \NN$. 
  Under Assumptions \ref{def:smoothness} and \ref{def:LSI}, if 
  \begin{equation*}
    \begin{aligned}
      \gamma 
      &\leq  \min\brc{\frac{\alpha_{\rm D}}{4\mu},\frac{\alpha_{\rm P}}{4\mu}, \frac{\alpha_{\rm D}\alpha_{\rm P}}{495\sqrt{\brr{1-\frac{\alpha_{\rm D}}{2} }\brr{1-\frac{\alpha_{\rm P}}{2}} \bar{L}}}}.
    \end{aligned}
  \end{equation*}
  Then, for every $K \in \NN$,
  \begin{equation*}
    \begin{aligned}
      {\KL{\rho_K}}  
      & \leq e^{-\mu \gamma K} \brs{ \KL{\rho_0} 
      +\frac{1 }{\mu} \brr{  C \bG^{\rm D}_0 +   D\bG^{\rm P}_0} } + \frac{\tau }{\mu}, 
    \end{aligned}
  \end{equation*}
  where $C,D>0$ are constants depending on the parameters of the algorithm and 
  \begin{equation*}
    \begin{aligned}
      C &= \frac{2.125}{e^{-\mu \gamma} -\lambda_1}, \quad
      D =\frac{ C\lambda_3}{ {e^{-\mu \gamma}  - (1-\alpha_{\rm P}) (1+w) }}, \\
      \tau &= \brr{2L^2 + \frac{5C\lambda_2}{\alpha_{\rm P}}}\brr{16\gamma^2{dL}+ 4d\gamma}.
    \end{aligned}
  \end{equation*}
\end{theorem}
The exact definitions of the undefined constants are written in the proof of the theorem, which is postponed to \Cref{sec:proof_bidir}. 
\subsection{Discussion on the communication complexity}\label{sec:discussion}

Doing the computations as mentioned at the end of \Cref{sec:delf-analysis}, we can deduce the following.
\begin{corollary} \label{corr:delf-conv}
  Under the assumptions of Theorem~\ref{thm:ef21-lmc} and
  $\gamma=\mathcal{O}\left(\frac{\mu p\varepsilon}{\beta_{\rm M} d}\right)$,  $K= {\mathcal{O}}\left(\frac{(1+\beta_{\rm M}) d}{\mu^2 p\varepsilon}\log \left(\frac{\Psi}{\varepsilon}\right)\right)$, the primal and dual ELF algorithms satisfy 
  $\KL{\rho_{K}} \leq \varepsilon$.
\end{corollary}  
Similarly, for the bidirectional ELF we have the below.
\begin{corollary}\label{corr:belf-conv}
  If $\alpha_{\rm P} = \alpha_{\rm D} = \alpha < 1/2$, under the conditions of Theorem~\ref{thm:bidir}, the iteration complexity for the B-ELF is 
  $\tilde{\cO}(\nicefrac{d \bar{L}}{\alpha^4 \mu^2\varepsilon})$.
\end{corollary}

The proof of Corollary~\ref{corr:belf-conv} is in \Cref{sec:proof-corr-belf}. 
When $1/\alpha = O(1)$, the rate of the LMC algorithm is recovered for all three algorithms. 
In particular, the scaled unbiased compressors, such as $\frac{8}{9}\cQ^{\rm nat}$, have a contractive coefficient of $\frac{8}{9}$. 
Our analysis may not match the usual LMC for other compressors, as the  
communication complexity is $\tilde{\cO}(d^2/\varepsilon)$ for LMC, while both the iteration and communication complexities are $\tilde{\cO}(d^5/\varepsilon)$ for B-ELF with Top-$1$.
However, in the next section we will see that these theoretical bounds are conservative and that the performance of the proposed methods on simple classification tasks matches the performance of LMC.



\section{Experiments}
\label{sec:experiments}

{\begin{figure*}[t]
  \centering
    \subfigure{
      \begin{minipage}[t]{\textwidth}
        \includegraphics[width=0.33\textwidth]{./plots/final_figures/a9a_gamma_1_Top_5_freq_10_m_100.pdf}
        \includegraphics[width=0.33\textwidth]{./plots/final_figures/a8a_gamma_1_Top_5_freq_10_m_100.pdf}
        \includegraphics[width=0.33\textwidth]{./plots/final_figures/mushrooms_gamma_1_Top_5_freq_10_m_100.pdf}
      \end{minipage}
    }
    \subfigure{
      \begin{minipage}[t]{\textwidth}
        \includegraphics[width=0.33\textwidth]{./plots/final_figures/a9a_gamma_1_Top_10_freq_10_m_100.pdf}
        \includegraphics[width=0.33\textwidth]{./plots/final_figures/a8a_gamma_1_Top_10_freq_10_m_100.pdf}
        \includegraphics[width=0.33\textwidth]{./plots/final_figures/mushrooms_gamma_1_Top_10_freq_10_m_100.pdf}
      \end{minipage}
    }
    \subfigure{
      \begin{minipage}[t]{\textwidth}
        \includegraphics[width=0.33\textwidth]{./plots/final_figures/a9a_gamma_1_Top_50_freq_10_m_100.pdf}
        \includegraphics[width=0.33\textwidth]{./plots/final_figures/a8a_gamma_1_Top_50_freq_10_m_100.pdf}
        \includegraphics[width=0.33\textwidth]{./plots/final_figures/mushrooms_gamma_1_Top_50_freq_10_m_100.pdf}
      \end{minipage}
    }
   
    \caption{Bayesian logistic regression with a Gaussian prior performed on three different datasets from LibSVM \citep{chang2011libsvm}. The $X$-axis represents the number of bits communicated, while the $Y$-axis represents the accuracy of the final estimator on the test set.}
    \label{fig:experiment}
\end{figure*}}

In this section, we conduct numerical experiments to compare \{B,D,P\}-ELF with the LMC. 
The code for the experiments can be found in \url{https://github.com/avetx/elf_code}. 
We implemented all four algorithms to solve a Bayesian logistic regression problem. 
The datasets are \texttt{a8a, a9a, mushrooms} from the LibSVM repository \citep{chang2011libsvm}.

In \Cref{fig:experiment}, we observe that all the methods have similar communication complexity on the abovementioned problem. 
In particular, this means that despite the theoretical results obtained above, the performance of ELF is not worse than LMC. Thus, in practice we achieve compression for free. 


\section{Conclusion}\label{sec:conclusion}

In this paper we proposed three error feedback based federated Langevin algorithms with dual, primal and bidirectional compression.
The first two are analyzed with one theorem and have similar theoretical performance. 
The third algorithm uses bidirectional compression which is slower due to the fact that EF21 and EF21-P do not couple. 
To the best of our knowledge, this is the first study of the federated sampling algorithms with bidirectional compression.
Our theoretical findings show that the communication complexity of this algorithm is worse than the one for the standard LMC. Nonetheless, simple experiments show that the theoretical analysis is rather conservative and that it can still be improved. This phenomenon is not surprising, as it was also observed for the original EF21 algorithm.


\subsection{Future work}

An immediate continuation of our paper would be to conduct more thorough experimental analysis of the ELF algorithms with other federated sampling techniques on high-dimensional data. 
Another possible direction is the theoretical analysis of the Langevin algorithm combined with EF21-P+DIANA. 
The latter is a bidirectional federated optimization algorithm that uses DIANA gradient estimator for the uplink compression instead of EF21. This method matches the performance of the GD due to the coupling of two methods \citep{sasha_kaja_EF-21P}.

Finally, there are yet many important algorithms of optimization that are relevant to our setting. 
Adaptation of these methods to the sampling setting can lead to fruitful results. 


\paragraph{Acknowledgements} 

The research reported in this publication was supported by funding from King Abdullah University of Science and Technology (KAUST): i) KAUST Baseline Research Scheme, ii) CRG Grant ORFS-CRG12-2024-6460, iii) Center of Excellence for Generative AI, under award number 5940, and iv) SDAIA-KAUST Center of Excellence in Artificial Intelligence and Data Science.





\bibliography{new}
\newpage

\onecolumn

\appendix

\title{{ELF: Federated Langevin Algorithms with Primal, Dual and Bidirectional Compression}\\(Supplementary Material)}
\maketitle


\section{Proofs of the propositions}

\subsection{Proof of Proposition~\ref{prop:uplink}}\label{sec:proof_uplink}

  From the definition
    \begin{equation*}
      \begin{aligned}
        \bG^{\rm D}_{k+1} 
          &= \frac{1}{n} \sum_{i=1}^{n} \EE\brs{\norm{g^i_{k+1} - \nabla F_i(x_{k+1})}^2}\\
          &= \frac{1}{n} \sum_{i=1}^{n} 
          \EE\brs{\EE\brs{\norm{ g^i_k + \cQ^{\rm D}(\nabla F_i (x_{k+1}) - g^i_k) - \nabla F_i(x_{k+1})}^2\mid x_1,\ldots,x_{k+1}}}\\
          &\leq \frac{1-\alpha_{\rm D}}{n} \sum_{i=1}^{n} 
          \EE\brs{\norm{ g^i_k - \nabla F_i(x_{k+1})}^2}.
           \end{aligned}
    \end{equation*}
    Applying Young's inequality and the Lipschitz continuity of the function $\nabla F_i(\cdot)$, we obtain
    \begin{equation*}
      \begin{aligned}
        \bG^{\rm D}_{k+1} 
          &\leq \frac{(1-\alpha_{\rm D})(1+s_{\rm D})}{n} \sum_{i=1}^{n}  \EE\brs{\norm{ g^i_k - \nabla F_i(x_{k})}^2} \\
          & +\frac{(1-\alpha_{\rm D})(1+s_{\rm D}^{-1})}{n} \sum_{i=1}^{n}  \EE\brs{\norm{\nabla F_i(x_{k}) - \nabla F_i(x_{k+1})}^2}\\
          &\leq {(1-\alpha_{\rm D})(1+s_{\rm D})} \bG^{\rm D}_{k}
               + \frac{(1-\alpha_{\rm D})(1+s_{\rm D}^{-1})}{n} \sum_{i=1}^{n}  L_i^2\EE\brs{\norm{x_{k} - x_{k+1}}^2}\\
          &\leq {(1-\alpha_{\rm D})(1+s_{\rm D})} \bG^{\rm D}_{k}
               + {(1-\alpha_{\rm D})(1+s_{\rm D}^{-1})} \bar{L} \EE\brs{\norm{x_{k} - x_{k+1}}^2}\\
          &\leq {(1- p)} \bG^{\rm D}_{k}
            + (1- p)\beta_{\rm D}  \EE\brs{\norm{x_{k} - x_{k+1}}^2}.
      \end{aligned}
    \end{equation*}
    This concludes the proof.

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

  \subsection{Proof of Proposition~\ref{prop:downlink}}
  From the definition
  \begin{equation}
    \begin{aligned}
      \bG^{\rm P}_{k+1} &= \bar{L}\Exp{\norm{w_{k + 1} - x_{k + 1}}^2} \\
      &= \bar{L}\Exp{\norm{\cQ^{\rm P}(x_{k + 1} - w_{k}) - (x_{k + 1} - w_{k})}^2} \\
      &\leq (1 - \alpha_{\rm P})\bar{L}\Exp{\norm{w_{k} - x_{k + 1}}^2} \\
      &= (1 - \alpha_{\rm P})\bar{L}\Exp{\norm{w_{k} - x_{k} + x_{k} - x_{k + 1}}^2}\\
      &\leq (1 - \alpha_{\rm P})(1+s_{\rm P})\bar{L}\Exp{\norm{w_{k} - x_{k}}^2} 
      + (1 - \alpha_{\rm P})(1 + s^{-1}_{\rm P})\bar{L}\Exp{\norm{x_{k} - x_{k + 1}}^2}.\\
    \end{aligned}
  \end{equation}
  Choosing $s_{\rm P}$ small enough, we can make the coefficient $(1 - \alpha_{\rm P})(1+s_{\rm P})$ smaller than one. 
  Thus, defining $p = 1 - (1 - \alpha_{\rm P})(1+s_{\rm P})$, we conclude the proof.

%%%%%%%%%%%%%%%%%%%%%%%%
  
  \subsection{Full statement of Proposition~\ref{prop:bidir_incom} and its proof}
  We state now the complete version of Proposition~\ref{prop:bidir_incom}.
\begin{proposition}\label{prop:bidir}
  The Lyapunov term $\bG^{\rm D}_k$ of the bidirectional Langevin algorithm satisfies the following recurrent inequality:
  \begin{equation*}
      \bG^{\rm D}_{k+1} \leq \lambda_1 \bG^{\rm D}_k + \lambda_2 \EE\brs{\norm{ x_{k}-  x_{k+1}}^2} + \lambda_3 \bG^{\rm P}_k,
  \end{equation*}
  where $\bG^{\rm P}_k := \bar{L}\EE\brs{\norm{ w_{k}-  x_{k}}^2}$ is the Lyapunov term for P-ELF and 
  \begin{equation}\label{eq:lambda-def}
      \begin{aligned}
        \lambda_1 &= (1-\alpha_{\rm D})(1+s)(1+q); \\
        \lambda_2 &= (1-\alpha_{\rm D})(1+s)(1+q^{-1})(1+u) \bar{L}
        \\ & + \brr{(1-\alpha_{\rm D})(1+s)(1+q^{-1})(1+u^{-1}) + (1+s^{-1})} (1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L};\\
        \lambda_3 &= \brr{(1-\alpha_{\rm D})(1+s)(1+q^{-1})(1+u^{-1}) + (1+s^{-1})} (1-\alpha_{\rm P}) (1 + w). \\
      \end{aligned}
    \end{equation}
    Here, $s,q,u,w$ are any positive numbers.
\end{proposition}
\begin{proof}
From the definition of $\bG^{\rm D}_k$ and  Young's inequality we have 
 \begin{equation*}
      \begin{aligned}
        \bG^{\rm D}_{k+1} 
          &= \frac{1}{n} \sum_{i=1}^{n} \EE\brs{\norm{g^i_{k+1} - \nabla F_i(x_{k+1})}^2}\\
          &= \frac{1}{n} \sum_{i=1}^{n} 
          \EE\brs{\EE\brs{\norm{ g^i_k + \cQ^{\rm D}(\nabla F_i (w_{k+1}) - g^i_k) - \nabla F_i(x_{k+1})}^2\mid x_1,\ldots,x_{k+1}}}\\
           &\leq \frac{1}{n} \sum_{i=1}^{n} \Big\{
          (1+s)\EE\brs{\EE\brs{\norm{ g^i_k + \cQ^{\rm D}(\nabla F_i (w_{k+1}) - g^i_k) - \nabla F_i(w_{k+1})}^2\mid x_1,\ldots,x_{k+1}}} \\
          &+ (1+s^{-1})\EE\brs{\norm{ \nabla F_i (w_{k+1}) - \nabla F_i(x_{k+1}) }^2 } \Big\}.
      \end{aligned}
    \end{equation*}
    The contractivity of $\cQ^{\rm D}$ implies
     \begin{equation*}
      \begin{aligned}
        \bG^{\rm D}_{k+1} 
          &\leq \frac{1}{n} \sum_{i=1}^{n} 
          (1-\alpha_{\rm D})(1+s) \EE\brs{\norm{ g^i_k - \nabla F_i(w_{k+1})}^2} 
          + (1+s^{-1})\bar{L} \EE\brs{\norm{ w_{k+1} - x_{k+1} }^2 }
          \\& \leq \frac{1}{n} \sum_{i=1}^{n} 
          (1-\alpha_{\rm D})(1+s)(1+q) \EE\brs{\norm{ g^i_k - \nabla F_i(x_{k})}^2}
          +(1-\alpha_{\rm D})(1+s)(1+q^{-1}) \EE\brs{\norm{  \nabla F_i(x_{k})- \nabla F_i(w_{k+1})}^2} 
          \\& + (1+s^{-1})\bar{L} \EE\brs{\norm{ w_{k+1} - x_{k+1} }^2 }
          \\& \leq 
          (1-\alpha_{\rm D})(1+s)(1+q) \bG^{\rm D}_k
          +(1-\alpha_{\rm D})(1+s)(1+q^{-1}) \bar{L} \EE\brs{\norm{ x_{k}-  w_{k+1}}^2} 
         + (1+s^{-1}) \bG^{\rm P}_{k+1}.
           \end{aligned}
    \end{equation*}
    Applying Young's inequality to the second term, we deduce
    \begin{equation*}
      \begin{aligned}
        \bar{L}\EE\brs{\norm{ x_{k}-  w_{k+1}}^2}
        &
        \leq (1+u)\bar{L}\EE\brs{\norm{ x_{k}-  x_{k+1}}^2} + (1+u^{-1})\bar{L} \EE\brs{\norm{ x_{k+1}-  w_{k+1}}^2}
        \\&
        = (1+u)\bar{L}\EE\brs{\norm{ x_{k}-  x_{k+1}}^2} 
        + (1+u^{-1}) \bG^{\rm P}_{k+1}.
      \end{aligned}
    \end{equation*}
    Therefore,
    \begin{equation*}
      \begin{aligned}
        \bG^{\rm D}_{k+1} 
          & \leq 
          (1-\alpha_{\rm D})(1+s)(1+q) \bG^{\rm D}_k
        +(1-\alpha_{\rm D})(1+s)(1+q^{-1})(1+u) \bar{L}\EE\brs{\norm{ x_{k}-  x_{k+1}}^2} 
        \\ & + (1-\alpha_{\rm D})(1+s)(1+q^{-1})  (1+u^{-1}) \bG^{\rm P}_{k+1}
          + (1+s^{-1})  {\bG^{\rm P}_{k+1}}.
      \end{aligned}
    \end{equation*}
    Let us now bound the auxiliary term $\bG^{\rm P}_{k+1}$. We notice that $\bG^{\rm P}_k$ is the Lyapunov term of the P-ELF algorithm. 
    Thus, from Proposition~\ref{prop:downlink} we have
    \begin{equation}\label{eq:tk-rec}
      \begin{aligned}  
        \bG^{\rm P}_{k+1} &= \bar{L}\EE\brs{\norm{ w_{k+1}-  x_{k+1}}^2} 
        \\ & \leq
          (1-\alpha_{\rm P}) (1+w)\bG^{\rm P}_k
          + (1-\alpha_{\rm P}) (1 + w^{-1})\bar{L}\EE\brs{\norm{  x_{k} -  x_{k+1}}^2}.
      \end{aligned}  
    \end{equation}
    Recalling the definitions of $\lambda_1,\lambda_2,\lambda_3$ we deduce 
    \begin{equation*}
      \bG^{\rm D}_{k+1} \leq \lambda_1 \bG^{\rm D}_k + \lambda_2 \EE\brs{\norm{ x_{k}-  x_{k+1}}^2} + \lambda_3 \bG^{\rm P}_k.
    \end{equation*}  
    This concludes the proof of the proposition.
\end{proof}


\section{Proofs of the main theorems}
\subsection{General scheme of the proofs}\label{sec:proof-scheme}

For all three algorithms the update of the LMC iteration is a stochastic estimator of the 
gradient $\nabla F(x_k)$.  
Generally, it depends on $x_{k}$ and  $\xi_k$, where $\xi_k$ is a sequence of i.i.d. random variables defined  on some probability space $(\Xi,\cF,\cP)$. 
The sequence $\xi_k$ comprises the randomness that arises at each step of the particular algorithm and it is independent of $x_k$. 
In order to prove convergence in {\sf KL} divergence, we use the interpolation method proposed in 
\citep{vempala2019rapid}.
The method is based on the Fokker-Planck equation of the Langevin diffusion. 
We state a lemma for general LMC algorithms with stochastic drift terms. 
In particular, all our algorithms can be generally written as
\begin{equation}\label{eq:stoch-lmc}
  x_{k+1} = x_k - \gamma f_{\xi_k}(x_k) + \sqrt{2\gamma} Z_k,
\end{equation}
where $\xi_k$ are i.i.d. random variables defined on some probability space $(\Xi,\cF,\cP)$. 
On the other hand, each step can be seen as a realization of a Langevin diffusion with a constant drift term
$ f_{\xi_k}(x_k)$:
\begin{equation}\label{eq:ld-stoch}
  \rmd y_t =  - f_{\xi_k}(x_k)\rmd t + \sqrt{2}\rmd B_t, 
\end{equation}
 with  $y_0 = x_k$ { and } $t \in [0,\gamma]$. 
Indeed,
\begin{equation*}
  \begin{aligned}
    y_\gamma 
    &= y_0 - \int_0^\gamma f_{\xi_k}(y_0)\rmd t + \sqrt{2}(B_\gamma - B_0)\\
    &= x_k -  \gamma f_{\xi_k}(x_k) + \sqrt{2\gamma}Z_k = x_{k+1}.
  \end{aligned}
\end{equation*}
The interpolation method is based on analyzing the Fokker-Planck equation of this diffusion.
In particular, we will upper bound the time derivative of $\KL{\rho_t}$:
\begin{equation}
    \begin{aligned}
      \dif{\KL{\rho_t}}
      &=\int_{\RR^d} \frac{\partial \rho_{t}(z)}{\partial t} \log\brr{\frac{\rho_t}{\pi}}(z)\rmd z.
    \end{aligned}
  \end{equation}
Here, the first term of the product under the integral can be computed using the abovementioned Fokker-Planck equation. 
The following lemma is the cornerstone of our analysis. 
\begin{lemma}\label{lem:fp-transformation}
If $y_t$ is the solution of the diffusion \eqref{eq:ld-stoch} and $\rho_t = \cL(y_t)$, then for 
every $t \in [0,\gamma]$,
  \begin{equation}\label{eq:ef21-lmc-tytytyti}
    \dif{\KL{\rho_t}} \leq -\frac{3}{4}\FS{\rho_t}+\Exp{\normsq{f_{\xi_k}(y_0)-\nabla F(y_t)}}.
  \end{equation}
\end{lemma}
The bound \eqref{eq:ef21-lmc-tytytyti} was initially derived by \citet{vempala2019rapid} for the standard Langevin Monte-Carlo. 
Its current stochastic form was later proved in \citep{marina-langevin} for MARINA Langevin algorithm.
The proof is postponed to \Cref{sec:lem-fp-trans}.

Lemma~\ref{lem:fp-transformation} is valid for all our algorithms. 
We then insert the value of the gradient estimator for each method and bound the last term by 
$\bG^{\rm D}_k$. Using the recurrent properties of the Lyapunov terms and replacing 
Fisher information term by Kullback-Leibler divergence with LSI
inequality we conclude the proof. 

\subsection{Some technical lemmas}

We will use repeatedly, sometimes without even mentioning, a simple inequality  which is a consequence of Young's inequality.
It goes as follows.
\begin{lemma}
  For any two vectors $x,y \in \RR^d$ and any $s > 0$
  \begin{equation*}
    \normsq{x+y} \leq  (1 + s) \normsq{x} + ( 1 + s^{-1}) \normsq{y}.
  \end{equation*}
\end{lemma}
\begin{proof}
  \begin{equation*}
    \begin{aligned}
      \normsq{x+y} &=  \normsq{x} + 2\inner{x}{y} +  \normsq{y}\\
      &\leq  (1 + s) \normsq{x} + ( 1 + s^{-1}) \normsq{y}.
    \end{aligned}
  \end{equation*}
  The second passage is due to Young's inequality.
\end{proof}

We also use two lemmas from the literature, which we present below without proofs. 
The first one is an instance of Grönwall's inequality in its integral form. 
Its proof can be found in \citep{amann2011ordinary}.
\begin{lemma}[Grönwall's Inequality]\label{lem:gronwall}
  Assume $\phi, B:[0, T] \rightarrow \mathbb{R}$ are bounded non-negative measurable functions and $C:[0, T] \rightarrow \mathbb{R}$ is a non-negative integrable function with the property that
  \begin{equation}
    \label{eq:grrrr1}
    \phi(t) \leq B(t)+\int_{0}^{t} C(\tau) \phi(\tau) \rmd \tau \quad \text { for all } t \in[0, T].
  \end{equation}
  Then
  \begin{equation*}
    \label{eq:grrrr2}
    \phi(t) \leq B(t)+\int_{0}^{t} B(s) C(s) \exp \left(\int_{s}^{t} C(\tau) \rmd \tau\right) \rmd s \quad \text { for all } t \in[0, T].
  \end{equation*}
\end{lemma}

The second is a technical lemma borrowed from \citet{chewi2021analysis}.
\begin{lemma}
  \label{lem:Chew}
  Suppose that $\nabla F$ is $L$-Lipschitz. Then for any probability measure $\nu$, the following inequality is satisfied:  
  \begin{equation*}
    \label{eq:chew}
    \mathbb{E}_{\nu}\left[\|\nabla F\|^{2}\right] \leq \mathbb{E}_{\nu}\left[\left\|\nabla \log \left( \frac{\nu}{ \pi}\right)\right\|^{2}\right]+2 d L =\FS{\nu}+2dL.
  \end{equation*}
\end{lemma}

\subsection{Proof of Theorem \ref{thm:ef21-lmc}}\label{sec:proof_ef21-lmc}

We follow the scheme described in \Cref{sec:proof-scheme}. 
Let us recall the initial setting first.
The update rule of both D-ELF and P-ELF can be abstractly defined by
\begin{equation*}
  x_{k+1} = x_k - \gamma g_k + \sqrt{2\gamma} Z_k.
\end{equation*}
The vector $g_k$ is a stochastic estimator of the potential function's gradient at the $k$-th iterate: 
$\nabla F(x_k)$. 
On the other hand, for each $k$ the next iteration can be computed using the following 
SDE:
\begin{equation}
  \rmd y_t =  - g_k\rmd t + \sqrt{2}\rmd B_t, 
\end{equation}  
with  $y_0 = x_k$ and  $t \in [0,\gamma]$.
Then, as shown in \Cref{sec:proof-scheme}, $y_{\gamma} = x_{k+1}$.  
Denote by $\rho_t$ the distribution of $y_t$. Lemma~\ref{lem:fp-transformation} yields:
  \begin{equation}\label{eq:lem1-proof}
  \begin{aligned}
    \dif{\KL{\rho_t}} 
    &\leq -\frac{3}{4}\FS{\rho_t}+\Exp{\normsq{f_{\xi_k}(y_0)-\nabla F(y_t)}}\\
    &\leq -\frac{3}{4}\FS{\rho_t}+\Exp{\normsq{g_k-\nabla F(y_t)}}.
  \end{aligned}
  \end{equation}

  \paragraph{The proof for D-ELF:} 
  The Lyapunov term for  the D-ELF algorithm is defined as
\begin{equation*}
    \bG^{\rm D}_k := \frac{1}{n} \sum_{i=1}^{n} \EE\brs{\norm{g^i_k - \nabla F_i(x_k)}^2}.
\end{equation*}

The next lemma bounds the second term in \eqref{eq:lem1-proof} using $\bG^{\rm D}_k$.

\begin{lemma}\label{lem:dif-kl-fi}
  If $f_{\xi_k}(x_k)$ is the gradient estimator $g_k$ from \Cref{alg:ef21_langevin}, then 
  $\rho_t$ satisfies
  \begin{equation}
    \label{eq:ef21-lmc-finahave}
    \dif{\KL{\rho_t}}\leq -\frac{3}{4}\FS{\rho_t}+2L^2\Exp{\normsq{x_{k+1}-x_k}}+2\bG^{\rm D}_k.
  \end{equation} 
\end{lemma}



Let us now add $C\bG^{\rm D}_{k+1}$ to both sides of the inequality \eqref{eq:ef21-lmc-finahave}, where $C > 0$ is a constant to be determined later:
\begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} + C   \bG^{\rm D}_{k+1}
    &\leq -\frac{3}{4}\FS{\rho_t}+2L^2\Exp{\normsq{x_{k+1}-x_k}}+2 \bG^{\rm D}_k+C  \bG^{\rm D}_{k+1}.
  \end{aligned}
\end{equation*}
Combining Proposition~\ref{prop:uplink} and Lemma~\ref{lem:dif-kl-fi} we deduce
\begin{equation*}
  \begin{aligned}
  \dif{\KL{\rho_t}} + C \bG^{\rm D}_{k+1}
    &\leq-\frac{3}{4}\FS{\rho_t} + 2L^2\Exp{\normsq{x_{k+1}-x_k}}+2 \bG^{\rm D}_k\\
    &+C \left((1-p)\bG^{\rm D}_k+(1-p){ }\beta_{\rm D}\Exp{\normsq{x_{k+1}-x_k}}\right)\\
    &=-\frac{3}{4}\FS{\rho_t}+\left(2L^2+C(1-p) \beta_{\rm D}\right)\Exp{\normsq{x_{k+1}-x_k}}+\left(2+C(1-p)\right) \bG^{\rm D}_k.
   \end{aligned}
\end{equation*}
The lemma below bounds the term $\Exp{\normsq{x_{k+1}-x_k}}$.
\begin{lemma}\label{lem:exp_x_k+1_x_k}
  If $\gamma\leq\frac{1}{2\sqrt{2}L}$, then the iterates of the stochastic LMC algorithm \eqref{eq:stoch-lmc} satisfy the following inequality, where $\bG^{\rm D}_k$ is the Lyapunov term of D-ELF algorithm defined in \eqref{eq:gk-ef21}: 
  \begin{equation}
    \label{eq:ef21-lmc-kk+1}
    \Exp{\normsq{x_{k+1}-x_k}}\leq 8\gamma^2\Exp{\normsq{\nabla F(y_t)}}+4\gamma^2 \bG^{\rm D}_k+4d\gamma.
  \end{equation}
  \end{lemma}

Lemma~\ref{lem:exp_x_k+1_x_k} yields the following
\begin{equation*}
  \begin{aligned}
  \dif{\KL{\rho_t}} + C \bG^{\rm D}_{k+1}
    &{\leq}-\frac{3}{4}\FS{\rho_t}
    +\left(2L^2+C(1-p) \beta_{\rm D}\right)\left(8\gamma^2\Exp{\normsq{\nabla F(y_t)}}+4\gamma^2 \bG^{\rm D}_k 
    + 4d\gamma\right)\\
    &\quad+\left(2+C(1-p)\right) \bG^{\rm D}_k.
  \end{aligned}
\end{equation*}

Let us now apply Lemma~\ref{lem:Chew} to the right-hand side. We obtain
\begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} + C  \bG^{\rm D}_{k+1}
    &\leq-\frac{3}{4}\FS{\rho_t}+\left(2L^2+C(1-p) \beta_{\rm D}\right)
    \left(8\gamma^2\left(\FS{\rho_t}+2dL\right)+4\gamma^2\bG^{\rm D}_k+4d\gamma\right)\\
    &\quad+\left(2+C(1-p)\right)\bG^{\rm D}_k\\
    &=-\left(\frac{3}{4}-8\gamma^2\left(2L^2+C(1-p) \beta_{\rm D}\right)\right)\FS{\rho_t}\\
    & \quad +\left(8L^2\gamma^2+C(1-p)\left(4 \gamma^2\beta_{\rm D}+1\right)+2\right)\bG^{\rm D}_k\\
    &\quad+{\left(2L^2+C(1-p) \beta_{\rm D}\right)\left(16L\gamma^2d+4d\gamma\right)}.\\
   \end{aligned}
  \end{equation*}
  From the definition of $\tau$ we obtain the following:
  \begin{equation}
    \begin{aligned}
      \label{eq:ef21-lmc-jp}
      \dif{\KL{\rho_t}} + C  \bG^{\rm D}_{k+1}  
      &\leq  -\left(\frac{3}{4}-8\gamma^2\left(2L^2+C(1-p) \beta_{\rm D}\right)\right)\FS{\rho_t}\\
      &\quad+\left(8L^2\gamma^2+C(1-p)\left(4 \gamma^2\beta_{\rm D}+1\right)+2\right)\bG^{\rm D}_k+\tau.\\
    \end{aligned}
  \end{equation}
Let $C=\left(8L^2\gamma^2+C(1-p)\left(4 \gamma^2\beta_{\rm D}+1\right)+2\right)e^{\mu \gamma}$.  
Solving this linear equation w.r.t. $C$, we get
\begin{equation}
  \label{eq:ef21-lmc-newC}
  C=\frac{8L^2\gamma^2 + 2}{e^{-\mu \gamma}-(1-p)\left(4 \gamma^2\beta_{\rm D}+1\right)}.
\end{equation}


Without loss of generality we may assume that  $\mu \gamma <1 $  and thus we have $e^{\mu \gamma}\leq 1+2\mu \gamma$. 
In order for $C$ to be positive, we need to assure that
\begin{equation*}
  1-(1-p)\left(4 \beta_{\rm D} \gamma^2+1\right)\left(1+2\mu \gamma\right)>0.
\end{equation*}
The latter is equivalent to 
\begin{equation*}
  {\frac{1-p}{p}8 \mu\beta_{\rm D} \gamma^3} + {\frac{1-p}{p}4 \beta_{\rm D} \gamma^2} + {\frac{1-p}{p}2\mu \gamma} <1.
\end{equation*}
A simple solution to this inequality is to make all three terms smaller than $1/3$.
The latter is equivalent to
\begin{equation}\label{old_re:ree}
  \gamma< \min \left\{\left(\frac{p}{24 \mu\beta_{\rm D}(1-p)}\right)^{1/3}, \left(\frac{p}{12 \beta_{\rm D} (1-p)}\right)^{1/2},\frac{p}{6\mu(1-p)} \right\}.
\end{equation}
On the other hand, we will require the coefficient of $\FS{\rho_t}$ in \eqref{eq:ef21-lmc-jp} to be negative. This is to ensure contraction. That means
\begin{equation*}
  8\gamma^2\left(2L^2 + C(1-p)  \beta_{\rm D}\right)=8\gamma^2\left(2L^2+\frac{(8L^2\gamma^2 + 2) (1-p)  \beta_{\rm D}}{e^{-\mu \gamma}-(1-p)\left(4 \gamma^2\beta_{\rm D}+1\right)}\right)\leq\frac{1}{4}.
\end{equation*}
Solving this inequality we get 
\begin{equation}\label{eq:ef21-lmc-hbounddddd}
  \gamma\leq\frac{1}{2}\sqrt{\frac{1-(1-p)e^{\mu \gamma}}{ \brr{16+(1-p)(17\beta_{\rm D}-16)e^{\mu \gamma}}}}.
\end{equation}

From \eqref{old_re:ree}, we know that $\gamma< \frac{p}{6\mu(1-p)}$, so $e^{\mu \gamma}\leq 1+2\mu \gamma\leq
1+\frac{p}{3(1-p)}$. Inserting this upper bound  into \eqref{eq:ef21-lmc-hbounddddd}, we get a lower bound on the right hand side. That is 
\begin{equation*}
  \begin{aligned}
    \frac{1}{2}\sqrt{\frac{2p}{ \brs{17\beta_{\rm D}(3-2p)+32p}}}
    &=\frac{1}{2}\sqrt{\frac{1-(1-p)(1+\frac{p}{3(1-p)})}{  \brr{16+(1-p)(17\beta_{\rm D}-16)(1+\frac{p}{3(1-p)})}}}\\
    &\leq\frac{1}{2}\sqrt{\frac{1-(1-p)e^{\mu \gamma}}{ \brr{16+(1-p)(17\beta_{\rm D}-16)e^{\mu \gamma}}}}.
  \end{aligned}
\end{equation*}
So we need 
\begin{equation*} 
    \gamma<\min \left\{\frac{1}{2}\sqrt{\frac{2p}{ \brs{17\beta_{\rm D}(3-2p)+32p}}},\left(\frac{p}{24 \mu\beta_{\rm D}(1-p)}\right)^{1/3}, \left(\frac{p}{12 \beta_{\rm D} (1-p)}\right)^{1/2},\frac{p}{6\mu(1-p)}  \right\}.
\end{equation*}

We can further simplify this inequality. 
The first and third terms  are larger than $a:=\frac{1}{14}\sqrt{\frac{p}{ (1+\beta_{\rm D})}},$ while the fourth term is larger than $b := \frac{p}{6\mu}$.  
On the other hand, $\min\{a,b\}$ is less than the second term. 
Indeed,
\begin{equation*}
  \min\{a,b\}\leq a^{2/3}b^{1/3}=\left(\frac{p^2}{1176 \mu(1+\beta_{\rm D})}\right)^{1/3}
  \leq \left(\frac{p}{24 \mu\beta_{\rm D}(1-p)}\right)^{1/3}.
\end{equation*}

Summing up, we obtain the following bound on the step-size that guarantees $C\geq0$ and \eqref{eq:ef21-lmc-hbounddddd}:
\begin{equation*}
    \gamma\leq\min \left\{\frac{1}{14}\sqrt{\frac{p}{ (1+\beta_{\rm D})}},\frac{p}{6\mu}\right\}.
\end{equation*}
Therefore, the above conditions are satisfied. This yields the following:
\begin{equation}
  \label{eq:ef21-lmc-finnnnn}
  \dif{\KL{\rho_t}}+C\bG^{\rm D}_{k+1}\leq -\frac{1}{2}\FS{\rho_t} + e^{-\mu \gamma} C\bG^{\rm D}_k 
  + \tau.
\end{equation}

Since $\pi$ satisfies Log-Sobolev inequality, we deduce
\begin{equation}
  \label{eq:ef21-lmc-LSILA}
  \dif{\KL{\rho_t}}+C\bG^{\rm D}_{k+1}\leq-\mu\KL{\rho_t}+e^{-\mu \gamma}C\bG^{\rm D}_k+\tau.
\end{equation}
One may check that the  equivalent integral form of \eqref{eq:ef21-lmc-LSILA} 
 satisfies \eqref{eq:grrrr1} with $\phi(t)=\KL{\rho_t},~B(t)=\left(e^{- \mu \gamma}C\bG^{\rm D}_k-C\bG^{\rm D}_{k+1}+\tau\right)t+\KL{\rho_{k\gamma}},~C(t)=-\mu$. Therefore,  from Lemma~\ref{lem:gronwall} we deduce 
\begin{equation*}
  \label{old_eq;KLKLKL}
  \KL{\rho_t}\leq e^{-\mu t}\KL{\rho_{k\gamma}}+\frac{1-e^{-\mu t}}{\mu}\left(e^{-\mu \gamma}C\bG^{\rm D}_k-C\bG^{\rm D}_{k+1}+\tau\right),
\end{equation*}
let $t=\gamma$ and $\beta=e^{\mu \gamma}$, then we have
\begin{equation}
  \begin{aligned}
    \KL{\rho_{(k+1)\gamma}}+\frac{1-e^{-\mu \gamma}}{\mu}C\bG^{\rm D}_{k+1}&\leq e^{-\mu \gamma}\left(\KL{\rho_{k\gamma}}+e^{\mu \gamma}\frac{1-e^{-\mu \gamma}}{\mu}\beta^{-1}C\bG^{\rm D}_k\right)+\frac{1-e^{-\mu \gamma}}{\mu}\tau\\
    &=e^{-\mu \gamma}\left(\KL{\rho_{k\gamma}}+\frac{1-e^{-\mu \gamma}}{\mu}C\bG^{\rm D}_{k}\right)+\frac{1-e^{-\mu \gamma}}{\mu}\tau.
  \end{aligned}
\end{equation} 

Repeating this step for $k=0,1,2,\cdots,K-1$, we obtain 
\begin{equation*}
    \begin{aligned}
      \bH_K &\leq e^{- K \mu \gamma}\bH_0 + \frac{1-e^{-K\mu \gamma}}{\mu}\tau.
    \end{aligned}
\end{equation*}
This proves Theorem~\ref{thm:ef21-lmc} for D-ELF.

\paragraph{The proof for P-ELF:} 

The gradient estimator $\nabla f_{\xi_k}(x_k)$ in this case is equal to 
\begin{equation*}
  \nabla f_{\xi_k}(x_k) = \nabla F(w_k) =\frac{1}{n} \sum_{i=1}^n \nabla F_i(w_k). 
\end{equation*}
From $L_i$-smoothness of the $i$-th component function $F_i$ we deduce the following relation:
\begin{equation} \label{eq:gd-gp}
  \begin{aligned}
  \bG^{\rm D}_k &=   \frac{1}{n} \sum_{i=1}^{n} \EE\brs{\norm{\nabla F_i(w_k) - \nabla F_i(x_k)}^2}\\
   & \leq  \frac{1}{n} \sum_{i=1}^{n} \EE\brs{L_i^2 \norm{ w_k - x_k}^2}\\
   & = \bG^{\rm P}_k.
  \end{aligned}
\end{equation}
Therefore, combining this inequality with Lemma~\ref{lem:dif-kl-fi} we obtain
\begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} 
    &\leq -\frac{3}{4}\FS{\rho_t}+2L^2\Exp{\normsq{x_{k+1}-x_k}}+2\bG^{\rm D}_k \\
    &\leq -\frac{3}{4}\FS{\rho_t}+2L^2\Exp{\normsq{x_{k+1}-x_k}}+2  \bG^{\rm P}_k.
  \end{aligned}
\end{equation*}
The latter means that we can repeat exactly the rest of the proof of D-ELF by replacing $\bG^{\rm D}_k$ with $ \bG^{\rm P}_k$ and using Proposition~\ref{prop:downlink} instead of Proposition~\ref{prop:uplink}.
Therefore, 
\begin{equation*}
    \begin{aligned}
      \bH_K &\leq e^{- K \mu \gamma}\bH_0 + \frac{1-e^{-K\mu \gamma}}{\mu}\tau.
    \end{aligned}
\end{equation*}
This concludes the proof of Theorem~\ref{thm:ef21-lmc}.



\subsection{Proof of Theorem~\ref{thm:bidir}}\label{sec:proof_bidir}
We recall the definition of the Lyapunov term $\bG^{\rm D}_k$:
\begin{equation*}
  \bG^{\rm D}_k := \frac{1}{n} \sum_{i=1}^{n} \EE\brs{\norm{g^i_k - \nabla F_i(x_k)}^2}.
\end{equation*}

As described in \Cref{sec:proof-scheme}, we use the interpolation proof scheme.
That is for the $k$-th iteration we define the process $y_t$ as in \eqref{eq:ld-stoch}.
Thus, from Lemma~\ref{lem:fp-transformation} we have
\begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} 
     & \leq -\frac{3}{4}\FS{\rho_t}+\Exp{\normsq{f_{\xi_k}(y_0)-\nabla F(y_t)}}
    \\ & = -\frac{3}{4}\FS{\rho_t}+\Exp{\normsq{ g_0 - \nabla F(y_t)}}.
  \end{aligned}
\end{equation*}

Combining this with Proposition~\ref{prop:bidir} and \eqref{eq:tk-rec}, we obtain
\begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} &+ C \bG^{\rm D}_{k+1} + D \bG^{\rm P}_{k+1}
    \\ & \leq 
    -\frac{3}{4}\FS{\rho_t}+2L^2\Exp{\normsq{x_{k+1}-x_k}}+2 \bG^{\rm D}_k+C  \bG^{\rm D}_{k+1}  + D \bG^{\rm P}_{k+1}
    \\ & \leq 
    -\frac{3}{4}\FS{\rho_t}+2L^2\Exp{\normsq{x_{k+1}-x_k}}+2 \bG^{\rm D}_k+C \brr{\lambda_1 \bG^{\rm D}_k + \lambda_2 \EE\brs{\norm{ x_{k}-  x_{k+1}}^2} + \lambda_3 \bG^{\rm P}_k } 
    \\ & + D \brr{(1-\alpha_{\rm P}) (1+w)\bG^{\rm P}_k
              + (1-\alpha_{\rm P}) (1 + w^{-1})\bar{L}\EE\brs{\norm{  x_{k} -  x_{k+1}}^2} }
    \\ & = 
    -\frac{3}{4}\FS{\rho_t} + \brr{2L^2 + C\lambda_2 + D(1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L}}\EE\brs{\norm{  x_{k} -  x_{k+1}}^2}
     \\ & + (2 + C\lambda_1)\bG^{\rm D}_k  + \brr{C\lambda_3 +  D(1-\alpha_{\rm P}) (1+w)}\bG^{\rm P}_k.
      \end{aligned}
\end{equation*}
Lemma~\ref{lem:exp_x_k+1_x_k} yields
\begin{equation*}
  \Exp{\normsq{x_{k+1}-x_k}}\leq 8\gamma^2\Exp{\normsq{\nabla F(y_t)}}+4\gamma^2 \bG^{\rm D}_k + 4d\gamma,
\end{equation*}
for $\gamma < \nicefrac{1}{8L}$. 
The latter condition on the step-size is a consequence of our assumptions from the statement of Theorem~\ref{thm:bidir}. 
Therefore,
\begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} &+ C \bG^{\rm D}_{k+1} + D \bG^{\rm P}_{k+1}
     \\ & \leq 
    -\frac{3}{4}\FS{\rho_t} + \brr{2L^2 + C\lambda_2 + D(1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L}}\brr{8\gamma^2\Exp{\normsq{\nabla F(y_t)}}+4\gamma^2 \bG^{\rm D}_k + 4d\gamma}
     \\ & + (2 + C\lambda_1)\bG^{\rm D}_k  + \brr{C\lambda_3 +  D(1-\alpha_{\rm P}) (1+w)}\bG^{\rm P}_k.
  \end{aligned}
\end{equation*}

Applying Lemma~\ref{lem:Chew} we deduce 
\begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} &+ C \bG^{\rm D}_{k+1} + D \bG^{\rm P}_{k+1}
     \\ & \leq 
    -\frac{3}{4}\FS{\rho_t} + \brr{2L^2+ C\lambda_2  + D(1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L}}\brr{8\gamma^2\brs{\FS{\rho_t}+2dL}+4\gamma^2 \bG^{\rm D}_k + 4d\gamma}
     \\ & + (2 + C\lambda_1)\bG^{\rm D}_k  + \brr{C\lambda_3 +  D(1-\alpha_{\rm P}) (1+w)}\bG^{\rm P}_k
    \\ & =
    \brr{-\frac{3}{4} + 8\gamma^2 \brr{2L^2+ C\lambda_2  + D(1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L}} }\FS{\rho_t} 
     \\ & + \brc{2 + C\lambda_1 + 4\gamma^2\brr{2L^2 + C\lambda_2  + D(1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L}}}\bG^{\rm D}_k + \brr{C\lambda_3 + D(1-\alpha_{\rm P}) (1+w)}\bG^{\rm P}_k
      \\ & + {\brr{2L^2 + C\lambda_2  + D(1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L}}\brr{16\gamma^2{dL}+ 4d\gamma}}.
  \end{aligned}
\end{equation*}
Let us choose $C$ and $D$ to satisfy 
\begin{equation}\label{eq:CD-def}
  \begin{aligned}
    C &= \frac{2.125}{e^{-\mu \gamma} -\lambda_1} \quad \text{and} \quad
    D =\frac{ {2.125}\lambda_3}{ \brr{e^{-\mu \gamma} -\lambda_1}\brr{e^{-\mu \gamma}  - (1-\alpha_{\rm P}) (1+w) }},
  \end{aligned}
\end{equation}
where $\mu$ is the constant from Log-Sobolev inequality.
In order for $C$ and $D$ to be positive we need $\lambda_1$ and  $(1-\alpha_{\rm P}) (1+w)$ to be smaller than  $e^{-\mu\gamma}$. 
We will choose $w$ and  $q = s$ as solutions to the following equations:
\begin{equation}\label{eq:qw}
   \begin{aligned}
      \lambda_1 = (1 - \alpha_{\rm D})(1 + q)^2 &= 1 - \frac{\alpha_{\rm D}}{2}; \\ 
      (1-\alpha_{\rm P}) (1+w) &= 1 - \frac{\alpha_{\rm P}}{2}.
   \end{aligned}
 \end{equation} 
 Then, 
 \begin{equation}\label{eq:gamma-alpha}
   e^{-\mu\gamma} > 1 - \mu\gamma > \max\brc{1 - \nicefrac{\alpha_{\rm D}}{4},1 -\nicefrac{ \alpha_{\rm P}}{4}}
 \end{equation} thus the denominators are positive. 
 Furthermore, 
 \begin{equation*}
   D =\frac{ {2.125}\lambda_3}{ \brr{e^{-\mu \gamma} -\lambda_1}\brr{e^{-\mu \gamma}  - (1-\alpha_{\rm P}) (1+w) }}
   \leq \frac{4C\lambda_3}{\alpha_{\rm P}}.
 \end{equation*}
 Recall that the definitions of $\lambda_2 $ and $\lambda_3$ are given in \eqref{eq:lambda-def}.
    Since $(1-\alpha_{\rm P}) (1 + w) < 1$, from the definition of $\lambda_3$ we have 
     \begin{equation*}
      \begin{aligned}
        \lambda_3
        &= \brr{2(1-\alpha_{\rm D})(1+q)(1+q^{-1}) + (1+q^{-1})} (1-\alpha_{\rm P}) (1 + w)\\
        &\leq { \brr{2(1-\alpha_{\rm D})(2+q+q^{-1}) + (1+q^{-1})} (1-\alpha_{\rm P}) (1 + w)} \\
        &\leq  \brr{2(1-\alpha_{\rm D})(2+q+q^{-1}) + (1+q^{-1})}.
      \end{aligned}
    \end{equation*}
    Therefore, \eqref{eq:lambda-def} implies
     \begin{equation*}
      \begin{aligned}
        \lambda_3(1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L} 
        &\leq \brr{2(1-\alpha_{\rm D})(2+q+q^{-1}) + (1+q^{-1})} (1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L} \leq
        \lambda_2.
      \end{aligned}
    \end{equation*}
    Thus, 
    \begin{equation*}
    \begin{aligned}
      \gamma^2 \brr{2L^2 + C\lambda_2 + D(1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L}}
      &\leq   \gamma^2 \brr{2L^2 + C\lambda_2 + \frac{4C\lambda_3}{\alpha_{\rm P}} (1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L}}
       \\
      & \leq \gamma^2 \brr{2L^2 + C\lambda_2 +   \frac{4C\lambda_2}{\alpha_{\rm P}}}\\
      & \leq \gamma^2 \brr{2L^2 +   \frac{5C\lambda_2}{\alpha_{\rm P}}}.
    \end{aligned}
  \end{equation*}
  The next lemma bounds the right hand side of the previous inequality by a constant. 
  This will allow us to get a negative coefficient for the $\FS{\rho_t}$ term. 
\begin{lemma}\label{lem:gamma-cond}
  Suppose $u=1$, $q = s$, $C$ and $D$ are defined as in \eqref{eq:CD-def}. Let  \eqref{eq:qw} and \eqref{eq:gamma-alpha}
  also be true.
  Under the assumptions of Theorem~\ref{thm:bidir}, the step-size satisfies the following inequality:  
  \begin{equation*}
    \gamma^2 \brr{2L^2 +   \frac{5C\lambda_2}{\alpha_{\rm P}}} < \frac{1}{32}.    
  \end{equation*}
\end{lemma}
The proof is postponed to \Cref{proof:lem:gamma-cond}.
Applying Lemma~\ref{lem:gamma-cond} to the first term we finally obtain the following recurrent inequality
  \begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} &+ C \bG^{\rm D}_{k+1} + D \bG^{\rm P}_{k+1}\\
    & \leq -\frac{1}{2}\FS{\rho_t} 
    +  \brr{2.125 + C\lambda_1}\bG^{\rm D}_k + \brr{C\lambda_3 + D(1-\alpha_{\rm P}) (1+w)}\bG^{\rm P}_k
      \\ & + {\brr{2L^2 + C\lambda_2  + D(1-\alpha_{\rm P}) (1 + w^{-1}) \bar{L}}\brr{16\gamma^2{dL}+ 4d\gamma}}\\
     & \leq -\frac{1}{2}\FS{\rho_t} 
    +  \brr{2.125 + C\lambda_1}\bG^{\rm D}_k + \brr{C\lambda_3 + D(1-\alpha_{\rm P}) (1+w)}\bG^{\rm P}_k
      \\ & + \underbrace{\brr{2L^2 +   \frac{5C\lambda_2}{\alpha_{\rm P}}}\brr{16\gamma^2{dL}+ 4d\gamma}}_{:=\tau}.
    \end{aligned}
\end{equation*}

Then, inserting the values of $C$ and $D$, we get
\begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} &+ C \bG^{\rm D}_{k+1} + D \bG^{\rm P}_{k+1}
       \leq 
    -\frac{1}{2} \FS{\rho_t} 
    + e^{-\mu \gamma} C \bG^{\rm D}_k + e^{-\mu \gamma} D\bG^{\rm P}_k + \tau.
  \end{aligned}
\end{equation*}
Let us now apply LSI: 
\begin{equation*}
  \begin{aligned}
    \dif{\KL{\rho_t}} &+ C \bG^{\rm D}_{k+1} + D \bG^{\rm P}_{k+1} \leq - {\mu}\KL{\rho_t} + e^{-\mu \gamma} C \bG^{\rm D}_k + e^{-\mu \gamma} D\bG^{\rm P}_k + \tau.
  \end{aligned}
\end{equation*}
Hence, the derivative of the function $\KL{\rho_t}$ is bounded by itself plus a term that does not depend on $t$. Lemma~\ref{lem:gronwall} yields the following:
\begin{equation*}
  \begin{aligned}
    {\KL{\rho_t}} & \leq   e^{-\mu t}  \KL{\rho_0} 
    +\frac{1 - e^{-\mu t} }{\mu} \brr{e^{-\mu \gamma} C \bG^{\rm D}_k + e^{-\mu \gamma} D\bG^{\rm P}_k -  C \bG^{\rm D}_{k+1} - D \bG^{\rm P}_{k+1} + \tau}.
  \end{aligned}
\end{equation*}
In particular, for $t = \gamma$, we have
\begin{equation*}
  \begin{aligned}
    {\KL{\rho_{\gamma}}}  +\frac{1 - e^{-\mu \gamma} }{\mu}\brr{C \bG^{\rm D}_{k+1} + D \bG^{\rm P}_{k+1}} 
    & \leq   e^{-\mu \gamma}  \KL{\rho_0} 
    +\frac{1 - e^{-\mu \gamma} }{\mu} \brr{e^{-\mu \gamma} C \bG^{\rm D}_k + e^{-\mu \gamma} D\bG^{\rm P}_k  + \tau}
    \\ & =   e^{-\mu \gamma} \brs{ \KL{\rho_0} 
        +\frac{1 - e^{-\mu \gamma} }{\mu} \brr{  C \bG^{\rm D}_k +   D\bG^{\rm P}_k} } + \frac{1 - e^{-\mu \gamma} }{\mu}\tau.
  \end{aligned}
\end{equation*}
We first recall that $\rho_{\gamma} = \nu_{k+1}$ and $\rho_0 = \nu_k$. 
Repeating this inequality recurrently we deduce the following bound:
\begin{equation*}
  \begin{aligned}
    {\KL{\nu_K}}  +\frac{1 - e^{-\mu \gamma} }{\mu}\brr{C \bG^{\rm D}_{K} + D \bG^{\rm P}_{K}} 
    & \leq e^{-\mu \gamma K} \brs{ \KL{\nu_0} 
    +\frac{1 - e^{-\mu \gamma} }{\mu} \brr{  C \bG^{\rm D}_0 +   D\bG^{\rm P}_0} } + \frac{\tau }{\mu}.
  \end{aligned}
\end{equation*}
This concludes the proof of Theorem~\ref{thm:bidir}. 

\begin{remark}
  One may check that, by repeating the analysis for the case when one of the compression operators is the identity ($\alpha = 1$), we recover the previously known algorithms.
\end{remark}

\subsection{Proof of Corollary~\ref{corr:belf-conv}}\label{sec:proof-corr-belf}
  First let us upper bound $\tau$. Similar to the proof of Corollary~\ref{corr:delf-conv}, $\brr{16\gamma^2{dL}+ 4d\gamma} < 5d\gamma$. 
  Thus,
  \begin{equation*}
    \begin{aligned}
       \tau 
       & \leq
       \brr{2L^2 + \frac{5C\lambda_2}{\alpha_{\rm P}}}5d\gamma
        \leq 
        \frac{45\lambda_2}{\alpha_{\rm D}\alpha_{\rm P}} { 5d\gamma} 
        \\ & =
        \cO\brr{   \frac{\brr{1-\frac{\alpha_{\rm D}}{2} }\brr{1-\frac{\alpha_{\rm P}}{2}}}{qw{\alpha_{\rm D}\alpha_{\rm P}}\brr{1  - \alpha_{\rm P}}\brr{1  - \alpha_{\rm D}}} \bar{L}  d\gamma}
         \\ &  
         = \cO\brr{  \frac{\bar{L}  d\gamma}{qw{\alpha_{\rm D}\alpha_{\rm P}}} }.
    \end{aligned}
  \end{equation*}




\section{Proofs of the lemmas}



\subsection{Proof of Lemma~\ref{lem:fp-transformation}}\label{sec:lem-fp-trans}
   Let $\rho_{0t}$ denote the joint distribution of $\left(y_0,\xi,y_t\right)$, which we write in terms of the conditionals and marginals as
  \begin{equation*}
  \rho_{0 t}\left(z,y_{0},\xi\right)=\rho_{0}\left(y_{0},\xi\right) \rho_{t \mid 0}\left(z \mid y_{0},\xi\right)=\rho_{t}\left(z\right) \rho_{0 \mid t}\left(y_{0},\xi \mid z\right) .
  \end{equation*}
  Conditioning on $\left(y_{0},\xi\right)$, the drift vector field $f_{\xi_k}(y_0)$ is a constant, so the Fokker-Planck formula for the conditional density $\rho_{t \mid 0}\left(z \mid y_{0},\xi\right)$ is given by
  \begin{equation}\label{eq:derivative_of_rho_t|0}
    \frac{\partial \rho_{t \mid 0}\left(z \mid y_{0},\xi\right)}{\partial t}=\nabla_z \cdot\left(\rho_{t \mid 0}\left(z \mid y_{0},\xi\right) f_{\xi}\left(y_{0}\right)\right)+\Delta \rho_{t \mid 0}\left(z \mid y_{0},\xi \right).
  \end{equation}
  To derive the evolution of $\rho_{t}$, we integrate w.r.t. $\left(y_{0},\xi\right) \sim \rho_{0}$:
  \begin{equation}
    \begin{aligned}
      \frac{\partial \rho_{t}(z)}{\partial t} &=\int_{\mathbb{R}^{d}\times\Xi} \frac{\partial \rho_{t \mid 0}\left(z \mid y_{0},\xi\right)}{\partial t} \rho_{0}\left(y_{0},\xi\right) \rmd y_{0} \rmd \xi\\
      &\stackrel{\eqref{eq:derivative_of_rho_t|0}}{=}\int_{\mathbb{R}^{d}\times\Xi}\left(\nabla_z \cdot\left(\rho_{t \mid 0}\left(z \mid y_{0},\xi\right) f_{\xi}\left(y_{0}\right)\right)+\Delta \rho_{t \mid 0}\left(z \mid y_{0},\xi \right)\right) \rho_{0}\left(y_{0},\xi\right) \rmd y_{0} \rmd \xi.
      \end{aligned}
  \end{equation}
  Using the definition of conditional densities and Fubini's theorem we deduce
  \begin{equation}
    \begin{aligned}
       \frac{\partial \rho_{t}(z)}{\partial t} &=\int_{\mathbb{R}^{d}\times\Xi}\left(\nabla_z \cdot\left(\rho_{0t}\left(z, y_{0},\xi\right)  f_{\xi}\left(y_{0}\right)\right)+\Delta \rho_{0t}\left(z, y_{0},\xi\right)\right) \rmd y_{0} \rmd \xi \\
      &=\nabla_z \cdot\left(\rho_{t}(z) \int_{\mathbb{R}^{d}\times\Xi} \rho_{0 \mid t}\left(y_{0} ,\xi\mid z\right) f_{\xi}\left(y_{0}\right) \rmd y_{0} \rmd \xi\right)+\Delta \rho_{t}(z) \\
      &=\nabla_z \cdot\left(\rho_{t}(z) \mathbb{E}_{\rho_{0 \mid t}}\left[f_{\xi}\left(y_{0}\right) \mid y_{t}=z\right]\right)+\Delta \rho_{t}(z).
    \end{aligned}
  \end{equation}
  Writing down the definition of {\sf KL} divergence and using Fubini's theorem, we deduce
  \begin{equation}
    \begin{aligned}
      \dif{\KL{\rho_t}}
      &=\int_{\RR^d} \frac{\partial \rho_{t}(z)}{\partial t} \log\brr{\frac{\rho_t}{\pi}}(z)\rmd z\\
      &=\int_{\RR^d}\left(\nabla_z \cdot\left(\rho_{t}(z) 
      \mathbb{E}_{\rho_{0 \mid t}}\left[f_{\xi}\left(y_{0}\right) \mid y_{t}=z\right]\right)+\Delta \rho_{t}(z)\right)\log\brr{\frac{\rho_t}{\pi}}(z)\rmd z\\
      &=-\int_{\RR^d}\inner{\mathbb{E}_{\rho_{0 \mid t}}\left[f_{\xi}\left(y_{0}\right) \mid y_{t}=z\right]+\nabla\log(\rho_t)(z)}{\nabla\log\brr{\frac{\rho_t}{\pi}}(z)}\rho_t(z)\rmd z\\
      &=-\int_{\RR^d}\Big({\nabla\log\brr{\frac{\rho_t}{\pi}}(z)-\nabla\log\brr{\frac{\rho_t}{\pi}}(z)+\mathbb{E}_{\rho_{0 \mid t}}\left[f_{\xi}\left(y_{0}\right) \mid y_{t}=z\right]+\nabla\log(\rho_t)(z)}\Big)^{\top}\\ 
      & \hspace{2cm} \times
      {\nabla\log\brr{\frac{\rho_t}{\pi}}(z)}\rho_t(z)\rmd z\\
      &=-\int_{\RR^d}\inner{\nabla\log\brr{\frac{\rho_t}{\pi}}(z)+\mathbb{E}_{\rho_{0 \mid t}}\left[f_{\xi}\left(y_{0}\right) \mid y_{t}=z\right]-\nabla F(z)}{\nabla\log\brr{\frac{\rho_t}{\pi}}(z)}\rho_t(z)\rmd z.\\
        \end{aligned}
  \end{equation}
  We recall the definition of Fisher information to bound the first term of the scalar product:
  \begin{equation}
    \begin{aligned} 
      \dif{\KL{\rho_t}}
      &\leq-\FS{\rho_t}-\int_{\RR^d}\inner{\mathbb{E}_{\rho_{0 \mid t}}\left[f_{\xi}\left(y_{0}\right) \mid y_{t}=z\right]-\nabla F(z)}{\nabla\log\brr{\frac{\rho_t}{\pi}}(z)}\rho_t(z)\rmd z.
    \end{aligned}
  \end{equation}
  From the Cauchy--Schwarz inequality, we deduce 
  \begin{equation}
    \begin{aligned} 
      \dif{\KL{\rho_t}}
      &\leq -\FS{\rho_t}+\frac{1}{4}\FS{\rho_t} + 
      \int_{\RR^d}\norm{\mathbb{E}_{\rho_{0 \mid t}}\left[f_{\xi}\left(y_{0}\right) \mid y_{t}=z\right]-\nabla F(z)}^2\rho_t(z)\rmd z\\
      & = -\frac{3}{4}\FS{\rho_t}+\Exp{\normsq{\mathbb{E}\left[f_{\xi_k}(y_0)-\nabla F(y_t)\mid y_t\right]}}\\
      & \leq-\frac{3}{4}\FS{\rho_t}+\Exp{\mathbb{E}\left[\normsq{f_{\xi_k}(y_0)-\nabla F(y_t)}\mid y_t\right]}\\
      & = -\frac{3}{4}\FS{\rho_t}+\Exp{\normsq{f_{\xi_k}(y_0)-\nabla F(y_t)}}.
    \end{aligned}
  \end{equation}
    This  concludes the proof of the lemma.





  \subsection{Proof of Lemma~\ref{lem:dif-kl-fi}}
  If we replace $f_{\xi_k}(y_0)$ by $g_0$ in \eqref{eq:ef21-lmc-tytytyti},  we will have
  \begin{equation*}
    \begin{aligned}
      \dif{\KL{\rho_t}}&\leq -\frac{3}{4}\FS{\rho_t}+\Exp{\normsq{\nabla F(y_t)-g_0}}\\
      &\leq -\frac{3}{4}\FS{\rho_t}+2\Exp{\normsq{\nabla F(y_t)-\nabla F(y_0)}}+2\Exp{\normsq{\nabla F(x_0)-g_0}}\\
      &=-\frac{3}{4}\FS{\rho_t}+2{\Exp{\normsq{\nabla F(y_t)-\nabla F(x_0)}}}
      +2\Exp{\normsq{\frac{1}{n}\sum_{i=1}^{n} \brc{\nabla F_i(x_0)-g^i_0}}}\\
      &\leq-\frac{3}{4}\FS{\rho_t}+2{\Exp{\normsq{\nabla F(y_t)-\nabla F(x_0)}}}+2\bG^{\rm D}_0.
    \end{aligned}
  \end{equation*}
  Here the last inequality is due to Jensen's inequality.
  Let us bound the second term. %, denote $\cF^k_t$ the filtration generated by $\{B_{s}\}_{s=0}^{k\gamma+t}$, then
  The smoothness of the gradient yields
  \begin{equation}
    \label{eq:grad_ytx0-p1}
    \begin{aligned}
      \mathbb{E}\left[\normsq{\nabla F(y_t)-\nabla F(x_0)}\right]
      \leq L^2\mathbb{E}\left[\normsq{ y_t-x_0}\right]=L^2\mathbb{E}\left[\normsq{t g_0+\sqrt{2}\left(B_{t}-B_{0}\right)}\right].
    \end{aligned}
  \end{equation}
  Since the Brownian process has independent increments we get
  \begin{equation}
    \label{eq:grad_ytx0-p2}
    \begin{aligned}
      \mathbb{E}\left[\normsq{\nabla F(y_t)-\nabla F(x_0)}\right]
     &\leq L^2t^2\mathbb{E}\left[\normsq{g_0}\right]+2tL^2d \\
     &\leq L^2\gamma^2\mathbb{E}\left[\normsq{g_0}\right]+2\gamma L^2d\\
     &= L^2\mathbb{E}\left[\normsq{ x_{1}-x_0}\right].
    \end{aligned}
  \end{equation}
  This concludes the proof.



  \subsection{Proof of Lemma~\ref{lem:exp_x_k+1_x_k}}

Let us bound the term $\Exp{\normsq{x_{k+1}-x_k}}$ using Young's inequality, Jensen's inequality and the $L$-smoothness of $F$:
\begin{equation*}
  \begin{aligned}
    \Exp{\normsq{x_{k+1}-x_k}}&=\gamma^2\Exp{\normsq{g_k}}+2d\gamma\\
    & \leq 2\gamma^2\left(\Exp{\normsq{\nabla F(x_k)}} + \Exp{\normsq{\nabla F(x_k)-g_k}}\right)+2d\gamma\\
    & \leq 2\gamma^2 \Exp{\normsq{\nabla F(x_k)}}+2\gamma^2\bG^{\rm D}_k+2d\gamma\\
    & \leq 4\gamma^2\left(\Exp{\normsq{\nabla F(y_t)}}+\Exp{\normsq{\nabla F(y_t)-\nabla F(x_k)}}\right)+2\gamma^2 
    \bG^{\rm D}_k+2d\gamma\\
    & \leq 4\gamma^2\Exp{\normsq{\nabla F(y_t)}}+4L^2\gamma^2\Exp{\normsq{y_{t}-x_k}}+2\gamma^2\bG^{\rm D}_k+2d\gamma\\
    & \leq 4\gamma^2\Exp{\normsq{\nabla F(y_t)}}+4L^2\gamma^2\Exp{\normsq{x_{k+1}-x_k}}+2\gamma^2\bG^{\rm D}_k+2d\gamma.
  \end{aligned}
\end{equation*}
Regrouping the terms we obtain 
\begin{equation*}
  \begin{aligned}
    (1 - 4L^2\gamma^2)\Exp{\normsq{x_{k+1}-x_k}}
    & \leq 4\gamma^2\Exp{\normsq{\nabla F(y_t)}} + 2\gamma^2\bG^{\rm D}_k + 2d\gamma.
  \end{aligned}
\end{equation*}
Dividing both sides by $1 - 4L^2\gamma^2$ and recalling that $2\sqrt{2}L\gamma < 1$, we conclude the proof.


\subsection{Proof of Lemma~\ref{lem:gamma-cond}}\label{proof:lem:gamma-cond}

    It is sufficient to show that 
    \begin{equation*}
      \gamma^2 \leq \min\brc{\frac{1}{192 L^2 },   \frac{\alpha_{\rm P}}{240C\lambda_2}}.
    \end{equation*}
    From the assumption of the theorem, we know that $\gamma^2 \leq \frac{1}{192 L^2 }$. Thus it remains to show that $\gamma^2$ is bounded by the second term:
    \begin{equation*}
      \begin{aligned}
        \gamma^2 
        &\leq \frac{\alpha_{\rm P}}{240 C\lambda_2} = \frac{\alpha_{\rm P}\brr{e^{-\mu \gamma} -\lambda_1}}{510\lambda_2}.
      \end{aligned}
    \end{equation*}

    Since $u = 1$ and $s=q$ we have the following bound on $\lambda_2$:
    \begin{equation*}
      \begin{aligned}
        \lambda_2 
        &\leq  \brs{2(1+q)(1+q^{-1})
             + \brr{2(1+q)(1+q^{-1})+ (1+q^{-1})} (1 + w^{-1})}\bar{L} \\
        &=  \brs{2(2+q+q^{-1})
             + \brr{2(2+q+q^{-1}) + (1+q^{-1})} (1 + w^{-1})}\bar{L} \\
        &= \frac{1}{q} \brs{2(2q+q^2+1)
             + \brr{2(2q+q^2+1) + (q+1)} (1 + w^{-1})}\bar{L} \\
        &\leq \frac{1}{qw} { {5(q+1)^2} (1 + w)}\bar{L} \\
        &\leq \frac{5}{qw}  \frac{\brr{1-\frac{\alpha_{\rm D}}{2} }\brr{1-\frac{\alpha_{\rm P}}{2}}}{\brr{1  - \alpha_{\rm P}}\brr{1  - \alpha_{\rm D}}} \bar{L}. 
      \end{aligned}
    \end{equation*}

    Therefore, we have an upper bound on $\lambda_2$. This means that it is sufficient for us to prove 
     \begin{equation*}
      \begin{aligned}
        \gamma^2 
        &\leq \frac{\alpha_{\rm P}\brr{e^{-\mu \gamma} -\lambda_1}}{510\frac{5}{qw}  \frac{\brr{1-\frac{\alpha_{\rm D}}{2} }\brr{1-\frac{\alpha_{\rm P}}{2}}}{\brr{1  - \alpha_{\rm P}}\brr{1  - \alpha_{\rm D}}} \bar{L}} \\
        & = \frac{{qw}\alpha_{\rm P}\brr{e^{-\mu \gamma} -\lambda_1}}
        {2550 \bar{L}} 
        \cdot\frac{\brr{1  - \alpha_{\rm P}}\brr{1  - \alpha_{\rm D}}}{\brr{1-\frac{\alpha_{\rm D}}{2} }\brr{1-\frac{\alpha_{\rm P}}{2}}}.
      \end{aligned}
    \end{equation*}
    From $\mu\gamma < \min\brc{\alpha_{\rm D},\alpha_{\rm P}}/4$ and $e^t > 1 + t$, we deduce
    $e^{-\mu \gamma} -\lambda_1 > \alpha_{\rm D}/4$.
    Combining these inequalities with \eqref{eq:qw}, we deduce that it is sufficient to prove
     \begin{equation*}
      \begin{aligned}
        \gamma^2 
        &\leq  \frac{qw\alpha_{\rm D}\alpha_{\rm P} \brr{1  - \alpha_{\rm P}}\brr{1  - \alpha_{\rm D}}}{10200\brr{1-\frac{\alpha_{\rm D}}{2} }\brr{1-\frac{\alpha_{\rm P}}{2}} \bar{L}}.
      \end{aligned}
    \end{equation*}
    Finally, using \eqref{eq:qw} once again, we derive 
    \begin{equation*}
      qw \geq \frac{\alpha_{\rm P}\alpha_{\rm D}}{24(1-\alpha_{\rm P})(1-\alpha_{\rm D})}.
    \end{equation*}
    Therefore, 
     \begin{equation*}
      \begin{aligned}
        \gamma^2 
        &\leq  \frac{\alpha_{\rm D}^2\alpha_{\rm P}^2 }{244800\brr{1-\frac{\alpha_{\rm D}}{2} }\brr{1-\frac{\alpha_{\rm P}}{2}} \bar{L}}.
      \end{aligned}
    \end{equation*}
    Taking square root on both sides we obtain
     \begin{equation*}
      \begin{aligned}
        \gamma 
        &\leq  \frac{\alpha_{\rm D}\alpha_{\rm P} }{495\sqrt{\brr{1-\frac{\alpha_{\rm D}}{2} }\brr{1-\frac{\alpha_{\rm P}}{2}} \bar{L}}}.
      \end{aligned}
    \end{equation*}
    This concludes the proof. 

  \section{Details on the experiments}
\label{app:experiments}



In this section, we describe the experimental setting in detail. 
The code for the experiments can be found at \url{https://anonymous.4open.science/r/elf_code-DE51/README.md}. 

\subsection{The setting}
We are interested in the Bayesian logistic regression problem with a Gaussian prior. 
In particular, our goal is to sample from the posterior distribution, whose negative log-likelihood, that is, the potential $F$, is given by 
\begin{equation*}
    F(x) = \frac{1}{n}\sum_{i=1}^{n}f_i(x); \qquad f_i(x) = \frac{1}{m_i}\sum_{j=1}^{m_i}\log\left(1 + e^{-b_{i, j}\cdot\inner{a_{i, j}}{x}}\right) + \frac{\lambda}{2} \norm{x}^2,
\end{equation*}
where $x\in\RR^d$ is the model, $\left(a_{i, j}, b_{i, j}\right) \in \RR^d \times \left\{-1, 1\right\}$ is one data point in the dataset of client $i$ whose size is $m_i$. 
Here, the coefficient $\lambda > 0$ is the inverse variance of the prior distribution.


The datasets used in this study are chosen from the LibSVM repository \citep{chang2011libsvm}. Specifically, we implement the \{B, D, P\}-ELF algorithms, along with the LMC algorithm for the aforementioned target, to solve a classification problem on the datasets \texttt{a8a}, \texttt{a9a}, and \texttt{mushrooms}.

For each dataset, we partition the data points into $40$ clients. 
Subsequently, we run all four methods with identical stepsizes selected from the set $\{0.01, 0.1, 0.5\}$. The compressor Top-$\tau$ is chosen for the ELF methods, where $\tau$ takes values from the set $\{1, 5, 10, 50, 100\}$. Given the stochastic nature of our algorithms, the final iterates are inherently random. 
To reduce variability in the final estimate, we compute the average of the last $100$ iterates for each method.

Each plot in \Cref{fig:experiment} features the communication complexity on the $\rm X$-axis and the test accuracy on the ${\rm Y}$-axis. Remarkably, across all plots, despite conservative theoretical expectations, the performance of all four algorithms appears nearly equivalent.

\end{document}
