%%%%%%%% ICML 2023 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

\documentclass{article}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2023} with \usepackage[nohyperref]{icml2023} above.
\usepackage{hyperref}


% Attempt to make hyperref and algorithmic work together better:
% hyperref builds link anchors from \theH<counter>; defining \theHalgorithm as
% the plain arabic value of the `algorithm' counter gives each algorithm float
% an anchor name derived from that number only.
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Use the following line for the initial blind version submitted for review:
\usepackage{icml2023}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}


\usepackage{changes}
\usepackage[makeroom]{cancel}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{subfigure}
\usepackage{hyperref}
\usepackage[capitalize,noabbrev]{cleveref}

\usepackage{algorithm}




% Link colouring and page layout, switched on the \isarxiv flag.
% \isarxiv is not defined anywhere in this file.
% NOTE(review): presumably a wrapper file or the build command defines it for
% the arXiv version -- confirm.
%
% arXiv branch:     red citation/internal links and 1in margins via geometry.
%                   (hyperref has already been loaded earlier in the preamble.)
% otherwise (ICML): (re)define the colours `red' and `darkblue', then set the
%                   PDF viewer options and link colours: red for internal
%                   links and URLs, darkblue for citations.
\ifdefined\isarxiv
\usepackage{hyperref}
\hypersetup{colorlinks=true,citecolor=red,linkcolor=red}
\usepackage[margin=1in]{geometry}
\else

\definecolor{red}{rgb}{1.0, 0.0, 0.0}
\definecolor{darkblue}{rgb}{0.0, 0.0, 0.55}
\hypersetup{
  pdffitwindow=true,
  pdfstartview={FitH},
  pdfnewwindow=true,
  colorlinks,
  linktocpage=true,
  linkcolor=red,
  urlcolor=red,
  citecolor=darkblue
}
\fi


%%%Zhao: guys, please don't comment the following lines
% Requests 1in margins when \isarxiv is defined; the \else branch is empty.
% The same \usepackage[margin=1in]{geometry} call already appears in the
% earlier \ifdefined\isarxiv block, so this second request repeats it with
% identical options.

\ifdefined\isarxiv
\usepackage[margin=1in]{geometry}
\else

\fi

% Custom colours (8-bit RGB triples).
% In this file, b2 is used by \Guang and mygreen by \Zhao; yl and myl are not
% referenced in this file (they may be used by the \input files -- not
% visible here).
\definecolor{b2}{RGB}{51,153,255}
\definecolor{mygreen}{RGB}{80,180,0}
\definecolor{yl}{RGB}{255,80,0}
\definecolor{myl}{RGB}{180,80,20}



% ---------------------------------------------------------------------------
% Theorem-like environments.
% `theorem' owns the counter, which is reset at every \section.  Every
% environment declared with [theorem] shares that counter, so statements are
% numbered consecutively within a section regardless of their kind.
% No \theoremstyle is selected in this file before these declarations.
% ---------------------------------------------------------------------------
\newtheorem{theorem}{Theorem}[section]

% Results.
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{observation}[theorem]{Observation}

% Definitions and standing hypotheses.
\newtheorem{definition}[theorem]{Definition}
\newtheorem{notation}[theorem]{Notation}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{hypothesis}[theorem]{Hypothesis}

% Discussion and open-ended statements.
\newtheorem{remark}[theorem]{Remark}
\newtheorem{example}[theorem]{Example}
\newtheorem{problem}[theorem]{Problem}
\newtheorem{open}[theorem]{Open Problem}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{question}[theorem]{Question}

% `proof' is deliberately not declared here; amsthm (loaded above) already
% provides a proof environment.
%\newtheorem{proof}[theorem]{Proof}

% `case' has its own independent counter and is not reset per section.
\newtheorem{case}{Case}

\input{def}

% Wide accents: make the standard \tilde and \hat stretch over their argument,
% and provide short aliases for the wide accents and the overline.
\renewcommand*{\tilde}{\widetilde}
\renewcommand*{\hat}{\widehat}
\newcommand*{\wt}{\widetilde}
\newcommand*{\wh}{\widehat}
\newcommand*{\ov}{\overline}

% Upright labels.
\newcommand*{\GS}{\mathrm{GS}}
\newcommand*{\ap}{\mathrm{ap}}
\newcommand*{\tr}{\mathrm{tr}}

% Upright differential d.  This replaces the previous meaning of \d
% (\renewcommand only succeeds because \d was already defined).
\renewcommand*{\d}{\mathrm{d}}
% Word operators.
\DeclareMathOperator{\poly}{poly}
\DeclareMathOperator{\sparse}{sparse}
% Starred form: sub/superscripts are placed as limits in display style.
\DeclareMathOperator*{\var}{\mathrm{Var}}

% Blackboard-bold and calligraphic symbols (declared as operators).
\DeclareMathOperator{\Z}{\mathbb{Z}}
\DeclareMathOperator{\C}{\mathbb{C}}
\DeclareMathOperator{\D}{\mathcal{D}}
\DeclareMathOperator{\cS}{\mathcal{S}}
\DeclareMathOperator{\M}{\mathcal{M}}

% Disabled declarations, kept for reference.
% \DeclareMathOperator{\R}{{\mathbb R}}
% \DeclareMathOperator*{\E}{{\mathbb{E}}} %%% Zhao: I'm not sure who commented this out before.
% \DeclareMathOperator*{\Var}{\mathrm{Var}}
% \DeclareMathOperator{\sign}{sign}
% \DeclareMathOperator*{\argmax}{arg\,max}
% \DeclareMathOperator*{\argmin}{arg\,min}

% Bold Greek.
\newcommand*{\bbeta}{{\boldsymbol{\beta}}}

% Upright labels.
\newcommand*{\loc}{\mathrm{local}}
\newcommand*{\glo}{\mathrm{global}}
\newcommand*{\RHS}{\mathrm{RHS}}
\newcommand*{\LHS}{\mathrm{LHS}}

% Calligraphic symbols.
\newcommand*{\W}{\mathcal{W}}
\newcommand*{\N}{\mathcal{N}}
%\newcommand{\R}{\mathbb{R}}

% Word operators.
\DeclareMathOperator{\vect}{vec}
\DeclareMathOperator{\dis}{dis}
\DeclareMathOperator{\cts}{cts}

% Check mark (\cmark) and cross mark (\xmark) glyphs.
% \ding comes from the pifont package, which no \usepackage line in this file
% loads; without it the first use of \cmark or \xmark stops with an
% "Undefined control sequence" error.  Loading it here is harmless if an
% \input file or style file has already loaded it.
\usepackage{pifont}
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%

% Upright `dist' label.
\newcommand*{\dist}{\mathrm{dist}}

% Bold nabla and bold theta.
\newcommand*{\bnabla}{{\boldsymbol{\nabla}}}
\newcommand*{\btheta}{{\boldsymbol{\theta}}}

% Inline co-author comments.
% \coauthornote{<colour>}{<name>}{<text>} typesets "[<name>: <text>]" in the
% given colour; the colour change is confined to a group.  Each per-author
% macro below expands to exactly that with a fixed colour and name.
\newcommand{\coauthornote}[3]{{\color{#1}[#2: #3]}}
\newcommand{\Zhao}[1]{\coauthornote{mygreen}{Zhao}{#1}}
\newcommand{\Yian}[1]{\coauthornote{red}{Yian}{#1}}
\newcommand{\Guang}[1]{\coauthornote{b2}{Guang}{#1}}
\newcommand{\Wei}[1]{\coauthornote{orange}{Wei}{#1}}
\newcommand{\Qian}[1]{\coauthornote{blue}{Qian}{#1}}

% Name of the method, set in typewriter type.
\newcommand*{\fedavg}{{\texttt{FedAvg}}}


% \footremember{<key>}{<text>}
%   Typesets <text> as an ordinary footnote and stores the footnote number in
%   a newly created counter named <key>, so that \footrecall{<key>} can print
%   the same mark again later.
%   <key> must not be the name of an existing counter: \newcounter raises an
%   error in that case, so each key can be remembered only once.
%   Every line inside the definition ends with `%' so that the line breaks do
%   not insert spaces after the footnote mark.
\newcommand{\footremember}[2]{%
    \footnote{#2}%
    \newcounter{#1}%
    \setcounter{#1}{\value{footnote}}%
}
% \footrecall{<key>}
%   Prints a footnote mark (no new footnote text) carrying the number stored
%   in the counter <key>, which \footremember{<key>}{...} creates and sets.
\newcommand{\footrecall}[1]{\footnotemark[\value{#1}]}




% \makeatletter
% \def\tagform@#1{\maketag@@@{[\ignorespaces#1\unskip\@@italiccorr]}}
% \makeatother

% Make \eqref behave exactly like \ref.
% Despite its name, \originaleqref is a copy of \ref as defined at this point
% in the preamble -- it is NOT a saved copy of the previous \eqref.  After the
% \renewcommand, \eqref{<label>} therefore expands to \ref{<label>}.
% NOTE(review): presumably intended to drop the parentheses that amsmath's
% \eqref puts around the number -- confirm this is still wanted.
\let\originaleqref=\ref
\renewcommand{\eqref}{\originaleqref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \theoremstyle{plain}
% \newtheorem{theorem}{Theorem}[section]
% \newtheorem{proposition}[theorem]{Proposition}
% \newtheorem{lemma}[theorem]{Lemma}
% \newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
% \newtheorem{definition}[theorem]{Definition}
% \newtheorem{assumption}[theorem]{Assumption}
% \theoremstyle{remark}
% \newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
% \usepackage[textsize=tiny]{todonotes}


% The \icmltitle you define below is probably too long as a header.
% Therefore, a short form for the running title is supplied here:
\icmltitlerunning{On Convergence of Federated Averaging Langevin Dynamics}

\begin{document}

\twocolumn[
\icmltitle{On Convergence of Federated Averaging Langevin Dynamics}

% It is OKAY to include author information, even for blind
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2023
% package.

% List of affiliations: The first argument should be a (short)
% identifier you will use later to specify author affiliations
% Academic affiliations should list Department, University, City, Region, Country
% Industry affiliations should list Company, City, Region, Country

% You can specify symbols, otherwise they are numbered in order.
% Ideally, you should not use this facility. Affiliations will be numbered
% in order of appearance and this is the preferred way.
\icmlsetsymbol{equal}{*}

\begin{icmlauthorlist}
\icmlauthor{Firstname1 Lastname1}{equal,yyy}
\icmlauthor{Firstname2 Lastname2}{equal,yyy,comp}
\icmlauthor{Firstname3 Lastname3}{comp}
\icmlauthor{Firstname4 Lastname4}{sch}
\icmlauthor{Firstname5 Lastname5}{yyy}
\icmlauthor{Firstname6 Lastname6}{sch,yyy,comp}
\icmlauthor{Firstname7 Lastname7}{comp}
%\icmlauthor{}{sch}
\icmlauthor{Firstname8 Lastname8}{sch}
\icmlauthor{Firstname8 Lastname8}{yyy,comp}
%\icmlauthor{}{sch}
%\icmlauthor{}{sch}
\end{icmlauthorlist}

\icmlaffiliation{yyy}{Department of XXX, University of YYY, Location, Country}
\icmlaffiliation{comp}{Company Name, Location, Country}
\icmlaffiliation{sch}{School of ZZZ, Institute of WWW, Location, Country}

\icmlcorrespondingauthor{Firstname1 Lastname1}{first1.last1@xxx.edu}
\icmlcorrespondingauthor{Firstname2 Lastname2}{first2.last2@www.uk}

% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
% the "keywords" metadata in the PDF but will not be shown in the document
\icmlkeywords{Machine Learning, ICML}

\vskip 0.3in
]

% this must go after the closing bracket ] following \twocolumn[ ...

% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \icmlEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.

%\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
\printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text.

\section{Response to y47s}

We appreciate the valuable comments.

\textbf{The strong convexity is too strong and dramatically hurts the significance of the paper.}

The strong convexity assumption is quite important in the understanding of Langevin Monte Carlo (or variants) algorithms under different scenarios [1,2,3] and there is no prior work discussing the convergence in federated learning. To fill this gap, we initiated the study of Bayesian federated learning with strong convexity assumptions.

We acknowledge that the convergence analysis in non-convex scenarios is also important and is a natural follow-up extension of our work. However, the dissipative condition used in [4] (or conditions on LSI) basically constrains the analysis to approximately unimodal distributions and suffers from an exponential dependence on the radius of the non-convex regions.

% We acknowledge that the convergence analysis in non-convex scenarios is also important, however, it suffers from the exponential dependence on parameters and doesn't shed much light on the training complexity of Bayesian federated learning in practice. Moreover, the dissipative condition used in [4] (or conditions on LSI) basically constrains the analysis to approximately unimodal distributions [5], which doesn't differ much from our work in terms of overall landscapes and is a natural follow-up extension.


[1] User-friendly Guarantees for the Langevin Monte Carlo with Inaccurate Gradient. Stochastic Processes and their Applications. 2019.

[2] On Thompson Sampling with Langevin Algorithms. ICML'20.

[3] On the Convergence of Hamiltonian Monte Carlo with Stochastic Gradients. ICML'21.

[4] Non-convex learning via Stochastic Gradient Langevin Dynamics: a nonasymptotic analysis. COLT'17.

Writing related: 
\textbf{Why use $\beta_k$ and $\theta_k$ separately when they are identical.}

We agree that $\beta_k=\theta_k$ when $k \bmod K\neq 0$; however, a divergence appears in Lemma C.2 due to the partial device participation and the periodic synchronization at steps with $k \bmod K= 0$. Similar techniques are used in Lemma 5 (on page 19) in [1].

Thanks for carefully checking the details of our work. We will polish our details to clarify the confusion in the revision.

[1] On the Convergence of FedAvg on Non-IID Data. ICLR'20. arXiv:1907.02189v4. 


\textbf{the connection between the continuous-time dynamics and the FALD is not clear in the main manuscript}

We will include more such discussions in the revision.

\section{Response to WMMq}

\textbf{The additional Gaussian noise can be viewed as an extra noise in the stochastic gradients so the existing convergence results of FedAvg could be directly applied.}

We respectfully disagree with this argument. Federated learning (FL) via SGD or its variants is a stochastic optimization algorithm, whereas FL via SGLD is a sampling algorithm with theoretical guarantees. The two proofs are fundamentally different, and we are the first to provide such a convergence analysis for Bayesian federated learning, which paves the way for global optimization and non-convex learning in future developments.



\section{Response to ogQa}

We appreciate the valuable comments.

\textbf{Real-world applications are not clear.}

This aggregation of various data resources via federated learning (FL) yields
promising applications in the internet of things [1], healthcare [2], and text data [3]. The mainstream FL methods are based on the optimization framework, which fails to quantify the uncertainty accurately. Such a problem leads to unreliable statistical inference and casts doubts on the credibility of the prediction tasks or diagnoses in medical applications. 

The convex analysis is the first step toward the understanding of sampling algorithms and our work paves the way for a more general analysis [4] of computational complexity in Bayesian federated learning. 

[1] A Joint Learning and Communications Framework for Federated Learning over Wireless Networks. IEEE Trans. on Wireless Communications, 2020.

[2] Multi-site fMRI Analysis using Privacy-preserving Federated Learning and Domain Adaptation: ABIDE Results. Medical Image Analysis, 2020.

[3] TextHide: Tackling Data Privacy in Language Understanding Tasks. In EMNLP, 2020.

[4] Federated Learning with a Sampling Algorithm under Isoperimetry.

\textbf{Strongly convex assumptions are too strong. see another response.}

Please see our response to reviewer y47s. Nevertheless, we believe extending our results to approximately unimodal landscapes with bounded LSI is also an important extension and we will include the discussion of [1] in the next revision.

[1] Federated Learning with a Sampling Algorithm under Isoperimetry.


\textbf{to area chairs}

Dear Area Chairs,

We appreciate you taking the time to read our rebuttal.

Federated learning has achieved tremendous progress over the last few years. However, most of the research is focused on stochastic optimization instead of Monte Carlo sampling. To fill the gap, we initiate the first step and propose to prove the convergence of Langevin dynamics in federated learning (FL). Such an analysis not only shows concrete guidance on the computational complexity of sampling in FL but also yields uncertainty guarantees for reliable predictions or diagnoses in applications, such as healthcare and fraud detection.



Despite not being general enough, convex analysis is often the first step for sampling analysis [1,2,3] and also paves the way for future extensions, such as [4]. We believe our work will be a promising contribution to the literature on Bayesian federated learning.

Best,
Authors.


[1] User-friendly Guarantees for the Langevin Monte Carlo with Inaccurate Gradient. Stochastic Processes and their Applications. 2019.

[2] On Thompson Sampling with Langevin Algorithms. ICML'20.

[3] On the Convergence of Hamiltonian Monte Carlo with Stochastic Gradients. ICML'21.

[4] Federated Learning with a Sampling Algorithm under Isoperimetry.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\section{Further comments on the 3-score reviewer}

Thank you for raising these important questions.

\textbf{There are FedAvg optimization papers without assuming bounded gradients}

Thanks for showing the interesting references to bypass the bounded gradient assumption in $L_2$ in the optimization literature. We will surely include such discussions in the revision. Nevertheless, the bounded gradient is proved using a different method via sampling properties, which is itself an interesting contribution to the federated learning community.

\textbf{Challenges in extending optimization to sampling}

A Monte Carlo sampling algorithm differs from an optimization method in the following aspects:

Equilibrium (from a stationary point to a distribution): an optimization method builds upon a sequence of iterates and aims to converge to a fixed (stationary) point; while a sampling algorithm leverages Markov kernels to converge to the invariant Gibbs distribution $\pi(\theta)\propto e^{-f(\theta)/\tau}$. In other words, we require not only a correct mean but also a true distribution.  

Distance measure (from $L_2$ to $W_2$): the convergence of points can be simply quantified by $L_1$ or $L_2$; by contrast, the convergence of distributions can be modeled by various delicate metrics/distances, such as the Wasserstein metric ($W_1$ or $W_2$ as defined in Line 154), the KL divergence, the Hellinger distance, and the TV distance, among others.

Continuous limit (from ODE to SDE): given infinitesimal stepsizes, an optimization method relies on a standard ODE (the gradient flow): $d\theta=-\nabla f(\theta)dt$; a sampling algorithm converges to an SDE/Langevin diffusion: $d\theta=-\nabla f(\theta)dt+\sqrt{2\tau}\,dW_t$, where $W_t$ is a nowhere-differentiable Brownian motion and requires careful treatment.

Proof techniques: 
To properly leverage the potential of the Brownian motion, the synchronous coupling technique is used to derive the crucial contraction property in Lemma B.5; the Burkholder--Davis--Gundy inequality and the It\^{o} isometry are used to bound the discretization error in Lemma B.2; working with the convergence of distributions (instead of points) also brings other complications, such as the discussion of the continuous limit in A.1.1 and the initial condition in Lemma F.1; the bounded gradient is directly proved by leveraging the distributional properties.


Theoretical potential: 

Although our work also relies on the standard strong convexity property, we emphasize that the extra injected noise is significantly larger than the stochastic gradient noise theoretically and yields promising global properties. Such a global property paves the way for non-convex analysis [1,2,3], a hitting time analysis to regions of interest [4], global optimization [5], and uncertainty estimations [6,7]. By contrast, most optimization methods only have local properties (converge to stationary points) and don't have guarantees in global optimization and uncertainty estimations.

[1] Non-convex learning via Stochastic Gradient Langevin Dynamics: a nonasymptotic analysis. COLT'17.

[2] Global Convergence of Langevin Dynamics Based Algorithms for Nonconvex Optimization. NeurIPS'18.

[3] Federated Learning with a Sampling Algorithm under Isoperimetry. 2022

[4] A Hitting Time Analysis of Stochastic Gradient Langevin Dynamics. COLT'17.

[5] Convex Optimization with Unbounded Nonconvex Oracles using Simulated Annealing. COLT'18.

[6] Consistency and Fluctuations for Stochastic Gradient
Langevin Dynamics. JMLR'16.

[7] Federated Learning via Posterior Averaging: A New Perspective and Practical Algorithms. ICLR'21



\textbf{What is the advantage of the proposed correlated scheme over simply reducing the amount of the additive noise by reducing the variance of the Gaussian vectors in Algorithm 3?}

Recall that our goal is to converge to a target distribution $\pi(\theta)\propto e^{-f(\theta)/\tau}$, where $\tau$ controls the variance and is crucial in uncertainty estimation; simply shrinking the injected noise therefore changes the distribution being sampled. Moreover, $\tau$ also yields a trade-off between exploration and exploitation in non-convex learning. As shown in Figure 1 of [1], a larger $\tau$ lowers the energy barriers and enables faster exploration, but makes hitting the global optima more challenging, while a smaller $\tau$ facilitates optimization but requires exponentially more time to escape local traps.

[1] Accelerating Convergence of Replica Exchange Stochastic Gradient MCMC via Variance Reduction. ICLR'21.


If we have addressed your concerns, we kindly ask you to consider raising your score. Federated learning has benefited a lot from the optimization literature; however, theoretical guarantees in uncertainty estimation and global optimization are also needed to ensure more reliable predictions.

Thank you again for your insightful comments, which have significantly improved the presentation of this paper.




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\end{document}


% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was created
% by Iain Murray in 2018, and modified by Alexandre Bouchard in
% 2019 and 2021 and by Csaba Szepesvari, Gang Niu and Sivan Sabato in 2022.
% Modified again in 2023 by Sivan Sabato and Jonathan Scarlett.
% Previous contributors include Dan Roy, Lise Getoor and Tobias
% Scheffer, which was slightly modified from the 2010 version by
% Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
% 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
% slightly modified from Prasad Tadepalli's 2007 version which is a
% lightly changed version of the previous year's version by Andrew
% Moore, which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
