%\documentclass{uai2023} % for initial submission
 \documentclass[accepted]{uai2023} % after acceptance, for a revised
 % version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

\usepackage{pifont}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

\usepackage{bm,amsmath,amsthm,amssymb,multicol,enumitem,subfigure}
\usepackage{xargs}
\usepackage{stmaryrd}
\usepackage{natbib}
\usepackage{comment}
% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{booktabs} % for professional tables
\usepackage{algpseudocode}
\usepackage{algorithm}

\usepackage[T1]{fontenc}
% \usepackage{enumerate}
\usepackage{inputenc}

\usepackage{graphicx} % more modern
\usepackage{subfigure}
\renewcommand*{\thesubfigure}{}
\usepackage{booktabs,balance}
\usepackage{rotating}
\usepackage{boldline}
\usepackage{makecell}
\usepackage{multirow}
\usepackage{balance}



% \usepackage[colorlinks,linkcolor=red,filecolor=blue,citecolor=blue,urlcolor=blue]{hyperref}

% \usepackage{url}
% % \usepackage[algo2e,ruled,noend]{algorithm2e}
% \newcommand\mycommfont[1]{\footnotesize\ttfamily\textcolor{blue}{#1}}
% \SetCommentSty{mycommfont}
% \setlength{\algomargin}{4pt}

\DeclarePairedDelimiter{\ceil}{\lceil}{\rceil}
\DeclarePairedDelimiter{\floor}{\lfloor}{\rfloor}
\newcommand{\pl}{Polyak-\L{}ojasiewicz}
\newcommand{\todoM}[1]{\textcolor{blue}{ToDo (Farzin): #1}}
\newcommand{\todo}[1]{\textcolor{red}{ToDo:~#1}}
\newcommand{\alert}[1]{\textcolor{red}{#1}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\E}{\mathrm{E}}

\theoremstyle{plain}
\newtheorem{theo}{Theorem}
\newtheorem{remark}[theo]{Remark}
\newtheorem{proposition}[theo]{Proposition}
\newtheorem{lem}[theo]{Lemma}
\newtheorem{coro}[theo]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theo]{Definition}
\newtheorem{assumption}[theo]{Assumption}

\def\M{\mathcal{M}}
\def\A{\mathcal{A}}
\def\Z{\mathcal{Z}}
\def\S{\mathcal{S}}
\def\D{\mathcal{D}}
\def\R{\mathcal{R}}
\def\P{\mathcal{P}}
\def\K{\mathcal{K}}
\def\E{\mathbb{E}}
\def\F{\mathfrak{F}}
\def\l{\boldsymbol{\ell}}

\newtheorem*{Lemma*}{Lemma}
\newtheorem*{Theorem*}{Theorem}
\newtheorem*{Corollary*}{Corollary}

\newcommand{\eqsp}{\;}
\newcommand{\beq}{\begin{equation}}
	\newcommand{\eeq}{\end{equation}}
\newcommand{\eqdef}{\mathrel{\mathop:}=}
\def\EE{\mathbb{E}}
\newcommand{\norm}[1]{\left\Vert #1 \right\Vert}
\newcommand{\pscal}[2]{\left\langle#1\,|\,#2 \right\rangle}
\def\major{\mathsf{M}}
\def\rset{\ensuremath{\mathbb{R}}}
\newcommand{\inter}{\llbracket n \rrbracket}
\newcommand{\interl}{\llbracket L \rrbracket}

\def\tot{\mathsf{h}}

\newcommand{\sign}{\text{sign}}
\newcommand{\ie}{{\em i.e.,~}}

\newcommand{\algo}{\textsc{Fed-LAMB}}


\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{multirow}
\usepackage{makecell}
\usepackage{pifont}
\usepackage{mathtools}
\usepackage{balance}

\usepackage{xcolor}
\usepackage{tikz}
\usetikzlibrary{tikzmark,calc}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Fed-LAMB: Layer-wise and Dimension-wise Locally Adaptive\\
Federated Learning (Supplemental Material)}



\author{Belhal Karimi, Ping Li, Xiaoyun Li\\
 Cognitive Computing Lab\\
Baidu Research\\
 10900 NE 8th St, Bellevue, WA 98004, USA \\
\texttt{\{belhal.karimi, pingli98, lixiaoyun996\}@gmail.com}
}


\begin{document}
\onecolumn

\maketitle

\appendix


\vspace{0.5in}

\section{Experiment Details and Results} \label{app:experiment}


\subsection{The Adp-Fed Algorithm}

Adp-Fed (Adaptive Federated Optimization) is one of the baseline methods compared with Fed-LAMB in our paper. The algorithm is given in Algorithm~\ref{alg:adp-fed}. The key difference between Adp-Fed and Fed-AMS~\citep{chen2020toward} is that, in Adp-Fed, each client runs local SGD (Line~\ref{adpfed line:local SGD}), and an Adam optimizer is maintained for the global adaptive optimization (Line~\ref{adpfed line:global adam}). In the Fed-AMS framework (as well as our Fed-LAMB), each client runs the local (adaptive) AMSGrad method, and the global model is simply obtained by averaging the local models. \citet{li2023analysis} proposed a variant of the Adp-Fed algorithm with communication compression.


\begin{algorithm}[H]
\caption{Adp-Fed: Adaptive Federated Optimization~\citep{reddi2021adaptive}} \label{alg:adp-fed}
\begin{algorithmic}[1]
%\small
\State \textbf{Input}: parameters $0< \beta_1, \beta_2 <1$, local learning rate $\eta_l$, global learning rate $\eta_g$, and weight decay parameter $\lambda \in [0,1]$.
\State \textbf{Initialize}: $\theta_{0,i} \in \Theta \subseteq \mathbb R^d $, $m_0=0$, $v_{0} =\epsilon$, $\forall i\in \llbracket n\rrbracket$, and $\theta_0 =  \frac{1}{n} \sum_{i=1}^n \theta_{0,i}$.
\vspace{0.05in}
\State \textbf{for $r=1, \ldots, R$ do}
\State $\quad$\textbf{parallel for device $i$ do}:
\State $\qquad$Set $\theta_{r,i}^{0} = \theta_{r-1}$.

\State $\qquad$\textbf{for $t=1, \ldots, T$ do}
\State $\qquad\quad$Compute stochastic gradient $g^t_{r,i}$ at $\theta_{r,i}^{t-1}$.
\State $\qquad\quad$$\theta_{r,i}^t=\theta_{r,i}^{t-1}-\eta_l g_{r,i}^t$ \label{adpfed line:local SGD} 
\State $\qquad$\textbf{end for}


\State $\qquad$Devices send $\triangle_{r,i}=\theta_{r,i}^T-\theta_{r,i}^0$ to server.

\State $\quad$\textbf{end for}

\State \quad Server computes $\bar{\triangle}_r = \frac{1}{n}\sum_{i=1}^n \triangle_{r,i}$

\State \quad $m_r = \beta_1 m_{r-1} + (1-\beta_1)\bar{\triangle}_r$

\State \quad $v_r = \beta_2 v_{r-1} + (1-\beta_2)\bar{\triangle}_r^2$

\State \quad $\theta_r = \theta_{r-1}+\eta_g\frac{m_r}{\sqrt{v_r}}$ \label{adpfed line:global adam}

\State \textbf{end for}
\State \textbf{Output}: Global model parameter $\theta_R$.
\end{algorithmic}
\end{algorithm}




\subsection{Hyper-parameter Tuning}

In our empirical study, we tune the learning rate of each algorithm carefully such that the best performance is achieved. The search grids in all our experiments are provided in Table~{\ref{tab:tuning}}. 

\begin{table}[h]
\centering
\caption{Search grids of the learning rate.}\label{tab:tuning}
% 	\resizebox{0.9\columnwidth}{!}{%
\begin{tabular}{c|c}
\toprule[1pt]
 & Learning rate range     \\ \hline  
Fed-SGD                  & $[0.001,0.003,0.005,0.01,0.03,0.05,0.1,0.3,0.5]$                      \\\hline 
Fed-AMS                  & $[0.0001,0.0003,0.0005,0.001,0.003,0.005,0.01,0.03,0.05,0.1]$ \\\hline 
Fed-LAMB     & $[0.001,0.003,0.005,0.01,0.03,0.05,0.1,0.3,0.5]$                      \\\hline 
\multirow{2}{*}{Adp-Fed} & Local $\eta_l$: $[0.0001,0.0003,0.0005,0.001,0.003,0.005,0.01,0.03,0.05,0.1,0.3,0.5]$      \\
    & Global $\eta_g$: $[0.0001,0.0003,0.0005,0.001,0.003,0.005,0.01,0.03,0.05,0.1]$ \\\hline 
Mime                  & $[0.0001,0.0003,0.0005,0.001,0.003,0.005,0.01,0.03,0.05,0.1]$ \\\hline 
Mime-LAMB     & $[0.001,0.003,0.005,0.01,0.03,0.05,0.1,0.3,0.5]$                      \\
\bottomrule[1pt]
\end{tabular}

\end{table}

\subsection{More Workers}

In Figure~\ref{fig:client200}, we provide additional figures with a larger number of workers, $n=200$, on MNIST and FMNIST with non-IID data. The conclusions stay the same: we see that the proposed Fed-LAMB and Mime-LAMB perform much better than the baseline algorithms, with faster convergence and better accuracy at the end of 100 FL training rounds.


\begin{figure}[h]
  \begin{center}
  \mbox{
    \includegraphics[width=2.5in]{figure_final/mnist_testerror_cnn_ep1_client200_iid0.pdf}
    \includegraphics[width=2.5in]{figure_final/fmnist_testerror_cnn_ep1_client200_iid0.pdf}
    }
  \caption{Test accuracy on MNIST and FMNIST with $n=200$ workers, full participation, local batch size 64. Data are non-IID distributed among clients.}
  \label{fig:client200}
  \end{center}
\end{figure}



\newpage

\section{Theoretical Analysis}\label{app:proofs}

We first recall in Table~\ref{tab:notationsapp} some important notations that will be used in our following analysis.
 
\begin{table}[H]
%\caption{Table of Notations}
\begin{center}% used the environment to augment the vertical space
% between the caption and the table
\begin{tabular}{r c p{12cm} }
\toprule
$R, T$ & $\eqdef$ &  Number of communication rounds and local iterations (resp.)\\
$n, D, i$ & $\eqdef$ &  Total number of clients, portion sampled uniformly and client index \\
$\tot, \ell$ & $\eqdef$ &  Total number of layers in the DNN and its index \\
$\phi(\cdot)$ & $\eqdef$ &  Scaling factor in Fed-LAMB update\\
$\bar{\theta}$ & $\eqdef$ &  Global model (after periodic averaging)\\
$\psi_{r,i}^{t}$ & $\eqdef$ &  ratio computed at round $r$, local iteration $t$ and for device $i$. $\psi_{r,i}^{\ell,t}$ denotes its component at layer $\ell$\\
\bottomrule
\end{tabular}
\end{center}
\caption{Summary of notations used in the paper.}
\label{tab:notationsapp}
\end{table}


We now provide the proofs for the theoretical results of the main paper, including the intermediary Lemmas and the main convergence result, Theorem 5.


\subsection{Intermediary Lemma}

We now develop the proof of the convergence rate of Fed-LAMB. We need a supporting Lemma~\ref{lemma:iterates} for this.

\vspace{0.05in}
\begin{lem}\label{lemma:iterates}
Consider $\{\overline{\theta_r}\}_{r>0}$, the sequence of parameters obtained by running Algorithm~1. Then, for all $i \in \inter$:
\beq\notag
\| \overline{\theta_r} - \theta_{r,i} \|^2 \leq \alpha^2 M^2 T^2 \phi_M^2 \frac{(1-\beta_2)p}{\epsilon} \eqsp,
\eeq
where $\phi_M$ is defined in Assumption~4 and $p = \sum_{\ell = 1}^\tot p_\ell$ is the total number of dimensions.
\end{lem}

\begin{proof}
Assuming the simplest case when $T=1$, i.e., one local iteration, then by construction of Algorithm 1, we have for all $\ell \in \llbracket \tot \rrbracket$, $i \in \inter$ and $r >0$:
\beq\notag
 \theta^{\ell}_{r,i} =  \overline{\theta_r}^{\ell}  - \alpha \sum_{t=1}^T \phi(\|\theta_{r,i}^{\ell,t-1}\|)\psi_{r,i}^{\ell,t} / \|\psi_{r,i}^{\ell,t}\|=  \overline{\theta_r}^{\ell}  - \alpha \sum_{t=1}^T\phi(\|\theta_{r,i}^{\ell,t-1}\|)  
 \frac{m^{t}_{r,i}}{\sqrt{v^{t}_{r}}} \frac{1}{\|\psi_{r,i}^{\ell,t}\|}
\eeq
leading to 
\beq\notag
\begin{split}
\|\overline{\theta_r}   -  \theta_{r,i}\|^2  = \sum_{\ell=1}^\tot \Vert\overline{\theta_r}^{\ell}   -  \theta^{\ell}_{r,i}\Vert^2 \leq \alpha^2 M^2 T^2 \phi_M^2 \frac{(1-\beta_2)p}{\epsilon} \eqsp,
\end{split}
\eeq
which concludes the proof.
\end{proof}



\subsection{Proof of Theorem 5 } \label{app:proofmain}


\begin{Theorem*}
Suppose \textbf{Assumptions 1--4} hold. Consider $\{\overline{\theta_r}\}_{r>0}$, the sequence of parameters obtained by running Algorithm 1 with a constant learning rate $\alpha$. Let the number of local epochs be $T \geq 1$ and $\lambda = 0$. Then, for any round $R > 0$, we have
\begin{align}
  \frac{1}{R}\sum_{r=1}^R  \EE\left[ \left\| \frac{\nabla f(\overline{\theta_r})}{\hat v_r^{1/4}}   \right \|^2 \right] &\leq    \sqrt{\frac{M^2 p}{n}}  \frac{ \triangle}{\tot \alpha R}+\frac{4\alpha^2 L M^2 T^2 \phi_M^2 (1-\beta_2)p}{\sqrt{\epsilon}} \\\notag
&+4\alpha^2 \frac{M^2}{\sqrt{\epsilon}} +      \frac{\phi_M   \sigma^2}{R n} \sqrt{\frac{1 - \beta_2}{M^2 p}  } +4\alpha \left[\phi_M \frac{\tot \sigma^2}{\sqrt{n}}\right]     + 4\alpha^2 \left[ \phi_M^2\sqrt{M^2+p\sigma^2} \right],\notag
\end{align}
where $\triangle=\EE[f(\bar{\theta}_1)]  - \min \limits_{\theta \in \Theta} f(\theta)$.
\end{Theorem*}

\begin{proof}
Our proof will make use of an intermediary virtual sequence defined as
\beq\label{eq:defseq}
\bar{\vartheta}_r = \bar{\theta}_r +  \frac{\beta_1}{1-\beta_1}(\bar{\theta}_{r} - \bar{\theta}_{r-1}) \eqsp,
\eeq
where $\bar{\theta}_r$ denotes the average of the local models at round $r$.
Then for each layer $\ell$,
\begin{align}\label{eq:gap}
\bar{\vartheta}^\ell_{r+1} - \bar{\vartheta}^\ell_r  & = \frac{1}{1-\beta_1}(\bar{\theta}^\ell_{r+1} - \bar{\theta}^\ell_{r}) - \frac{\beta_1}{1-\beta_1}(\bar{\theta}^\ell_{r} - \bar{\theta}^\ell_{r-1}) \nonumber\\
& = \frac{\alpha_{r}}{1-\beta_1} \frac{1}{n} \sum_{i = 1}^n \frac{\phi(\|\theta_{r,i}^{\ell}\|)}{\|\psi_{r,i}^{\ell}\|} \psi_{r,i}^{\ell}  - \frac{\beta_1 \alpha_{r-1}}{1-\beta_1} \frac{1}{n} \sum_{i = 1}^n \frac{\phi(\|\theta_{r-1,i}^{\ell}\|)}{\|\psi_{r-1,i}^{\ell}\|} \psi_{r-1,i}^{\ell}\nonumber\\
& = \frac{\alpha \beta_1}{1-\beta_1} \frac{1}{n}  \sum_{i = 1}^n  \left( \frac{\phi(\|\theta_{r,i}^{\ell}\|)}{\sqrt{v^{t}_{r}} \|\psi_{r,i}^{\ell}\|} - \frac{\phi(\|\theta_{r-1,i}^{\ell}\|)}{\sqrt{v^{t}_{r-1}} \|\psi_{r-1,i}^{\ell}\|} \right) m^{t}_{r-1} + \frac{\alpha}{n} \sum_{i = 1}^n \frac{\phi(\|\theta_{r,i}^{\ell}\|)}{\sqrt{v^{t}_{r}} \|\psi_{r,i}^{\ell}\|} g^t_{r,i} \eqsp,
\end{align}
where we have assumed a constant learning rate $\alpha$.

Using Assumption 1, we have
\begin{align}\notag
f(\bar{\vartheta}_{r+1}) &  \leq f(\bar{\vartheta}_r) + \pscal{\nabla f(\bar{\vartheta}_r)}{\bar{\vartheta}_{r+1} - \bar{\vartheta}_r} + \sum_{\ell =1}^\tot \frac{L_\ell}{2} \| \bar{\vartheta}^\ell_{r+1} - \bar{\vartheta}^\ell_r \|^2\\\notag
&  = f(\bar{\vartheta}_r) + \sum_{\ell=1}^\tot \sum_{j=1}^{p_\ell} \nabla_{\ell} f(\bar{\vartheta}_r)^j (\bar{\vartheta}^{\ell,j}_{r+1} - \bar{\vartheta}^{\ell,j}_r) + \sum_{\ell =1}^\tot \frac{L_\ell}{2} \| \bar{\vartheta}^\ell_{r+1} - \bar{\vartheta}^\ell_r \|^2  \eqsp.
\end{align}
Taking expectations on both sides leads to
\begin{align}\label{eq:main}
- \EE[  \pscal{\nabla f(\bar{\vartheta}_r)}{\bar{\vartheta}_{r+1} - \bar{\vartheta}_r}]  \leq  \EE[ f(\bar{\vartheta}_r) - f(\bar{\vartheta}_{r+1})] + \sum_{\ell =1}^\tot \frac{L_\ell}{2} \EE[  \| \bar{\vartheta}^\ell_{r+1} - \bar{\vartheta}^\ell_r \|^2] \eqsp.
\end{align}



We denote by $G > 0$ a majorant of $\phi$, i.e., a constant such that $\phi(\|\theta \|) \leq G$ for all $\theta \in \Theta$. 
Then, following \eqref{eq:main}, we obtain
\begin{align}\label{eq:main2}
- \EE[  \pscal{\nabla f(\bar{\vartheta}_r)}{\bar{\vartheta}_{r+1} - \bar{\vartheta}_r}]  \leq  \EE[ f(\bar{\vartheta}_r) - f(\bar{\vartheta}_{r+1})] + \sum_{\ell =1}^\tot \frac{L_\ell}{2} \EE[  \| \bar{\vartheta}_{r+1} - \bar{\vartheta}_r \|^2] \eqsp.
\end{align}
Developing the LHS of \eqref{eq:main2} using \eqref{eq:gap} leads to
\begin{align} \notag
\pscal{\nabla f(\bar{\vartheta}_r)}{\bar{\vartheta}_{r+1} - \bar{\vartheta}_r} &= \sum_{\ell=1}^\tot \sum_{j=1}^{p_\ell} \nabla_{\ell} f(\bar{\vartheta}_r)^j (\bar{\vartheta}^{\ell,j}_{r+1} - \bar{\vartheta}^{\ell,j}_r)  \\ \notag
& =  \frac{\alpha \beta_1}{1-\beta_1}\frac{1}{n}  \sum_{\ell=1}^\tot \sum_{j=1}^{p_\ell} \nabla_{\ell} f(\bar{\vartheta}_r)^j \left[   \sum_{i = 1}^n  \left( \frac{\phi(\|\theta_{r,i}^{\ell}\|)}{\sqrt{v^{t}_{r}} \|\psi_{r,i}^{\ell}\|} - \frac{\phi(\|\theta_{r-1,i}^{\ell}\|)}{\sqrt{v^{t}_{r-1}} \|\psi_{r-1,i}^{\ell}\|} \right) m^{t}_{r-1}  \right] \\ \label{eqn1}
& \underbrace{ -\frac{\alpha}{n} \sum_{\ell=1}^\tot \sum_{j=1}^{p_\ell} \nabla_{\ell} f(\bar{\vartheta}_r)^j  \sum_{i = 1}^n \frac{\phi(\|\theta_{r,i}^{\ell}\|)}{\sqrt{v^{t}_{r}} \|\psi_{r,i}^{\ell}\|} g_{r,i}^{t,l,j}}_{= A_1}   \eqsp.
\end{align}
Suppose $T$ is the total number of local iterations and $R$ is the number of rounds. We can write~\eqref{eqn1}~as
\begin{align}\notag
    A_1=-\alpha \langle \nabla f(\bar \vartheta_r),\frac{\bar g_r}{\sqrt{\hat v_r}} \rangle,
\end{align}
where $\bar g_r=\frac{1}{n}\sum_{i=1}^n \bar g_{t,i}$, with $\bar g_{t,i}=\Big[\frac{\phi(\Vert \theta_{t,i}^1\Vert)}{\Vert \psi_{t,i}^1\Vert}g_{t,i}^1,..., \frac{\phi(\Vert \theta_{t,i}^L\Vert)}{\Vert \psi_{t,i}^L\Vert}g_{t,i}^L   \Big]$ representing the normalized gradient (concatenated by layers) of the $i$-th device. It holds that
\begin{align}
    \langle \nabla f(\bar \vartheta_r),\frac{\bar g_r}{\sqrt{\hat v_r}} \rangle&=\frac{1}{2}\Vert \frac{\nabla f(\bar\vartheta_r) }{\hat v_r^{1/4}}\Vert^2+\frac{1}{2}\Vert \frac{\bar g_r }{\hat v_r^{1/4}}\Vert^2-\frac{1}{2}\Vert \frac{\nabla f(\bar\vartheta_r)-\bar g_r }{\hat v_r^{1/4}}\Vert^2.  \label{eqn:x1}
\end{align}


To bound the last term on the RHS, we have
\begin{align}\notag
    \Vert \frac{\nabla f(\bar\vartheta_r)-\bar g_r }{\hat v_r^{1/4}}\Vert^2=\Vert \frac{\frac{1}{n}\sum_{i=1}^n (\nabla f(\bar\vartheta_r)-\bar g_{t,i})}{\hat v_r^{1/4}} \Vert^2
    &\leq \frac{1}{n}\sum_{i=1}^n\Vert \frac{\nabla f(\bar\vartheta_r)-\bar g_{t,i}}{\hat v_r^{1/4}} \Vert^2\\\notag
    &\leq \frac{2}{n}\sum_{i=1}^n \Big(\Vert \frac{\nabla f(\bar\vartheta_r)-\nabla f(\bar\theta_r)}{\hat v_r^{1/4}} \Vert^2+\Vert \frac{\nabla f(\bar\theta_r)-\bar g_{t,i}}{\hat v_r^{1/4}} \Vert^2  \Big). 
\end{align}
By Lipschitz smoothness of the loss function, the first term admits
\begin{align}\notag
    \frac{2}{n}\sum_{i=1}^n\Vert \frac{\nabla f_i(\bar\vartheta_r)-\nabla f_i(\bar\theta_r)}{\hat v_r^{1/4}} \Vert^2 \leq \frac{2}{n \sqrt{\epsilon}}\sum_{i=1}^n L_\ell\Vert \bar\vartheta_r-\bar\theta_r\Vert^2 & =\frac{2L_\ell}{n \sqrt{\epsilon}}\frac{\beta_1^2}{(1-\beta_1)^2}\sum_{i=1}^n \Vert \bar\theta_r-\bar\theta_{r-1}\Vert ^2\\\notag
    &\leq \frac{2\alpha^2 L_\ell }{n \sqrt{\epsilon}}\frac{\beta_1^2}{(1-\beta_1)^2} \sum_{l=1}^L \sum_{i=1}^n\Vert \frac{\phi(\Vert \theta_{t,i}^l\Vert)}{\Vert \psi_{t,i}^l\Vert}\psi_{t,i}^l \Vert^2\\\notag
    &\leq \frac{2\alpha^2 L_\ell p\phi_M^2}{ \sqrt{\epsilon}}\frac{\beta_1^2}{(1-\beta_1)^2}.
\end{align}
For the second term,
\begin{align}\label{eq:inter}
    \frac{2}{n}\sum_{i=1}^n\Vert \frac{\nabla f(\bar\theta_r)-\bar g_{t,i}}{\hat v_r^{1/4}} \Vert^2 \leq \frac{4}{n}\Big( \underbrace{\sum_{i=1}^n \Vert \frac{\nabla f(\bar\theta_r)-\nabla f(\theta_{t,i})}{\hat v_r^{1/4}} \Vert^2}_{B_1} + \underbrace{ \sum_{i=1}^n\Vert \frac{\nabla f(\theta_{t,i})-\bar g_{t,i}}{\hat v_r^{1/4}} \Vert^2}_{B_2} \Big).
\end{align}
Using the smoothness of $f_i$ we can transform $B_1$ into consensus error by
\begin{align}\notag
    B_1\leq \frac{L}{\sqrt{\epsilon}}\sum_{i=1}^n \Vert \bar\theta_r - \theta_{t,i}\Vert^2  & =\frac{\alpha^2 L}{\sqrt{\epsilon}}\sum_{i=1}^n\sum_{l=1}^L \| \sum_{j=\lfloor t \rfloor_r+1}^t \Big( \frac{\phi(\Vert \theta_{j,i}^l\Vert)}{\Vert \psi_{j,i}^l\Vert}\psi_{j,i}^l-\frac{1}{n}\sum_{k=1}^n \frac{\phi(\Vert \theta_{j,k}^l\Vert)}{\Vert \psi_{j,k}^l\Vert}\psi_{j,k}^l \Big) \|^2\\\label{eqn:B1}
    &\leq n \frac{\alpha^2 L}{\sqrt{\epsilon}} M^2 T^2 \phi_M^2 (1-\beta_2)p,
\end{align}
where the last inequality stems from Lemma~\ref{lemma:iterates} in the particular case where $  \theta_{t,i}$ are averaged every $ct+1$ local iterations for any integer $c$, since $(t-1)-(\lfloor t \rfloor_r+1)+1 \leq T-1$.


%\newpage


We now bound $B_2$ (under the simplification that $\beta_1 = 0$):

\begin{align}\notag
    \mathbb E[B_2]&=\mathbb E[\sum_{i=1}^n\Vert \frac{\nabla f(\theta_{t,i})-\bar g_{t,i}}{\hat v_r^{1/4}} \Vert^2] \\\notag
    &\leq \frac{nM^2}{\sqrt{\epsilon}}+n\phi_M^2\sqrt{M^2+p\sigma^2}-2\sum_{i=1}^n\mathbb E[\langle \nabla f(\theta_{t,i}),\bar g_{t,i} \rangle/\sqrt{\hat v_r}]\\\notag
    &=\frac{nM^2}{\sqrt{\epsilon}}+n\phi_M^2\sqrt{M^2+p\sigma^2}-2\sum_{i=1}^n \sum_{\ell=1}^L \mathbb E[\langle \nabla_\ell f(\theta_{t,i}),\frac{\phi(\|\theta_{t,i}^l \|)}{\| \psi_{t,i}^l \|}g_{t,i}^l \rangle/\sqrt{\hat v_r^l}]\\\notag
    &=\frac{nM^2}{\sqrt{\epsilon}}+n\phi_M^2\sqrt{M^2+p\sigma^2}-2\sum_{i=1}^n \sum_{l=1}^L\sum_{j=1}^{p_l} \mathbb E[\nabla_l f(\theta_{t,i})^j\frac{\phi(\|\theta_{t,i}^{l,j} \|)}{\sqrt{\hat v_r^{l,j}}\| \psi_{t,i}^{l,j} \|}g_{t,i}^{l,j} ]\\\notag
    & \leq \frac{nM^2}{\sqrt{\epsilon}}+n\phi_M^2\sqrt{M^2+p\sigma^2}-2\sum_{i=1}^n \sum_{l=1}^L\sum_{j=1}^{p_l} \mathbb E \left[ \sqrt{\frac{1-\beta_2}{M^2 p_l}}  \phi(\|\theta_{r,i}^{l,j}\|)  \nabla_l f(\theta_{t,i})^j  g_{t,i}^{l,j}\right]\\\notag
    &\hspace{0.4in} -2 \sum_{i = 1}^n \sum_{l=1}^L\sum_{j=1}^{p_l}  \mathbb E \left[  \left( \phi(\|\theta_{r,i}^{l,j}\|)   \nabla_l f(\theta_{t,i})^j   \frac{g_{r,i}^{t,l,j}}{ \|\psi_{r,i}^{l,j}\|}\right)\mathsf{1}\left( \sign(  \nabla_l f(\theta_{t,i})^j) \neq  \sign( g_{r,i}^{t,l,j}) \right)\right],
\end{align}
where we use Assumption 2, Assumption 3 and Assumption 4. 
Yet,
\begin{align*}
&- \mathbb E \Bigg[  \left( \phi(\|\theta_{r,i}^{l,j}\|)   \nabla_l f(\theta_{t,i})^j   \frac{g_{r,i}^{t,l,j}}{ \|\psi_{r,i}^{l,j}\|}\right)\mathsf{1}\left( \sign(  \nabla_l f(\theta_{t,i})^j)
\neq  \sign( g_{r,i}^{t,l,j}) \right)\Bigg] \\
&\hspace{2in} \leq  \phi_M \nabla_l f(\theta_{t,i})^j   \mathbb{P}\left[  \sign(  \nabla_l f(\theta_{t,i})^j) \neq  \sign( g_{r,i}^{t,l,j}) \right].
\end{align*}
Then we have
\begin{align}\notag
    \mathbb E[B_2]\leq  \frac{nM^2}{\sqrt{\epsilon}}+n\phi_M^2\sqrt{M^2+p\sigma^2}-2 \phi_m \sqrt{\frac{1-\beta_2}{M^2 p}} \sum_{i=1}^n \E[\| \nabla f(\theta_{t,i}) \|^2] + \phi_M \frac{\tot \sigma^2}{\sqrt{n}}.
\end{align}
Thus, \eqref{eq:inter} becomes
\begin{align}\notag
    \frac{2}{n}\sum_{i=1}^n\Vert \frac{\nabla f(\bar\theta_r)-\bar g_{t,i}}{\hat v_r^{1/4}} \Vert^2 \leq 4 \left[ \frac{\alpha^2 L}{\sqrt{\epsilon}} M^2 T^2 \phi_M^2 (1-\beta_2)p + \frac{\alpha M^2}{\sqrt{\epsilon}}+\phi_M^2\sqrt{M^2+p\sigma^2} + \alpha\phi_M \frac{\tot \sigma^2}{\sqrt{n}}\right].
\end{align}

Substituting all ingredients into (\ref{eqn:x1}), we obtain
\begin{align}\notag
    -\alpha \mathbb E[\langle \nabla f(\bar \vartheta_r),\frac{\bar g_r}{\sqrt{\hat v_r}} \rangle] &\leq -\frac{\alpha}{2}\mathbb E\big[\Vert \frac{\nabla f(\bar\vartheta_r) }{\hat v_r^{1/4}}\Vert^2 \big]-\frac{\alpha}{2}\mathbb E\big[\Vert \frac{\bar g_r }{\hat v_r^{1/4}}\Vert^2 \big]+\frac{2\alpha^3 L_\ell p\phi_M^2}{ \sqrt{\epsilon}}\frac{\beta_1^2}{(1-\beta_1)^2} \\\notag
    &\hspace{0.1in}  + 4 \alpha \left[ \frac{\alpha^2 L}{\sqrt{\epsilon}} M^2 T^2 \phi_M^2 (1-\beta_2)p + \frac{\alpha M^2}{\sqrt{\epsilon}}+\phi_M^2\sqrt{M^2+p\sigma^2} +\alpha \phi_M \frac{\tot \sigma^2}{\sqrt{n}}\right].
\end{align}

%\newpage

To bound the second term on the RHS of the above inequality, we notice that
\begin{align}\notag
    \mathbb E\big[\Vert \frac{\bar g_r }{\hat v_r^{1/4}}\Vert^2\big]=\frac{1}{n^2}\mathbb E\big[\Vert \frac{\sum_{i=1}^n \bar g_{r,i}}{\hat v_r^{1/4}}\Vert^2 \big] &=\frac{1}{n^2}\mathbb E\big[ \sum_{l=1}^L \sum_{i=1}^n \Vert  \frac{\phi(\Vert \theta_{r,i}^l\Vert)}{\hat v^{1/4} \Vert \psi_{r,i}^l\Vert}g_{r,i}^l \Vert^2 \big] \\
    &\geq \phi_m^2(1-\beta_2) \mathbb E\left[ \Vert \frac{1}{n}\sum_{i=1}^n \frac{\nabla f(\theta_{r,i})}{\hat v^{1/4}_r} \Vert^2 \right]\\\notag
    &=\phi_m^2(1-\beta_2) \mathbb E\left[ \Vert  \frac{\overline\nabla f(\theta_{r})}{\hat v^{1/4}_r} \Vert^2 \right]. \notag
\end{align}


Regarding $\left\| \frac{\overline{\nabla}f(\theta_r)}{\hat v_r^{1/4}} \right\|^2$, we have
\begin{align*}
\left\| \frac{\overline{\nabla}f(\theta_r)}{\hat v_r^{1/4}} \right\|^2 & \geq \frac{1}{2} \left\| \frac{\nabla f(\overline{\theta_r})}{\hat v_r^{1/4}} \right\|^2 - \left\| \frac{\overline{\nabla}f(\theta_r)- \nabla f(\overline{\theta_r})}{\hat v_r^{1/4}} \right\|^2\\
& \geq \frac{1}{2} \left\| \frac{\nabla f(\overline{\theta_r})}{\hat v_r^{1/4}} \right\|^2 - \left\| \frac{\frac{1}{n}\sum_{i=1}^n (\nabla f_i(\theta_{r,i})-\nabla f(\bar\theta_r))}{\hat v_r^{1/4}} \right\|^2 \\
&\geq \frac{1}{2} \left\| \frac{\nabla f(\overline{\theta_r})}{\hat v_r^{1/4}} \right\|^2 - \frac{\alpha^2 L_\ell}{\sqrt{\epsilon}} M^2 T^2 (\sigma^2 + G^2) (1-\beta_2)p,
\end{align*}
where the last line is due to (\ref{eqn:B1}) and Assumption~3. Therefore, we have obtained
\begin{align*}
    A_1&\leq -\frac{\alpha\phi_m^2(1-\beta_2)}{4}\left\| \frac{\nabla f(\overline{\theta_r})}{\hat v_r^{1/4}} \right\|^2+\frac{\alpha^3 L_\ell}{\sqrt{\epsilon}} M^2 T^2 \phi_m^2\phi_M^2 (1-\beta_2)^2p+\frac{2\alpha^3 L_\ell p\phi_M^2}{ \sqrt{\epsilon}}\frac{\beta_1^2}{(1-\beta_1)^2} \\
    &\hspace{0.4in}  + 4\alpha \left[ \frac{\alpha^2 L}{\sqrt{\epsilon}}  M^2 T^2 (\sigma^2 + G^2) (1-\beta_2)p + \frac{M^2 \alpha }{\sqrt{\epsilon}}+ \alpha \phi_M^2\sqrt{M^2+p\sigma^2} + \phi_M \alpha \frac{\tot \sigma^2}{\sqrt{n}}\right],\\
    &\leq -\frac{\alpha\phi_m^2(1-\beta_2)}{4}\left\| \frac{\nabla f(\overline{\theta_r})}{\hat v_r^{1/4}} \right\|^2+\frac{\alpha^3 L_\ell}{\sqrt{\epsilon}} M^2 T^2 \phi_m^2\phi_M^2 (1-\beta_2)^2p+\frac{2\alpha^3 L_\ell p\phi_M^2}{ \sqrt{\epsilon}}\frac{\beta_1^2}{(1-\beta_1)^2} \\
    &\hspace{0.4in}  + 4 \alpha\Big[ \frac{\alpha^2 L}{\sqrt{\epsilon}}  M^2 T^2 G^2 (1-\beta_2)p + \frac{M^2 \alpha }{\sqrt{\epsilon}}+ \alpha \phi_M^2\sqrt{M^2+p\sigma^2} \\
    &\hspace{1.7in} + \sigma^2 \left(\frac{\alpha^2 L}{\sqrt{\epsilon}}  M^2 T^2(1-\beta_2)p+ \phi_M \alpha \frac{\tot }{\sqrt{n}} \right)\Big].
\end{align*}
Substituting back into \eqref{eqn1} and assuming $M\leq 1$, we obtain the following by taking the telescoping sum:
\begin{align*}
    &\frac{1}{R}\sum_{r=1}^R  \EE\left[ \left\| \frac{\nabla f(\overline{\theta_r})}{\hat v_r^{1/4}}   \right \|^2 \right] \\
    & \lesssim  \sqrt{\frac{M^2 p}{n}} \frac{ f(\bar{\vartheta}_1)  - \EE[ f(\bar{\vartheta}_{R+1})]}{\tot \alpha R}+   \frac{\alpha}{n^2}  \sum_{r=1}^R  \sum_{i = 1}^n  \sigma_i^2 \EE\left[ \left\|\frac{\phi(\|\theta_{r,i}^{\ell}\|)}{\sqrt{v_r} \|\psi_{r,i}^{\ell}\|} \right\|^2 \right] +\frac{2\alpha^3 \overline{L} p\phi_M^2}{ \sqrt{\epsilon}}\frac{\beta_1^2}{(1-\beta_1)^2} \nonumber\\
   &   +4 \Big[ \frac{\alpha^2 \overline{L}}{\sqrt{\epsilon}}  M^2 T^2 G^2 (1-\beta_2)p + \frac{\alpha M^2}{\sqrt{\epsilon}}+ \alpha \phi_M^2 \sqrt{M^2+p\sigma^2} \nonumber \\
   & + \sigma^2 \left(\frac{\alpha^2 \overline{L}}{\sqrt{\epsilon}}  M^2 T^2(1-\beta_2)p+ \phi_M\alpha \frac{\tot }{\sqrt{n}} \right)\Big]  +\frac{\alpha \beta_1}{1-\beta_1}  \sqrt{(1-\beta_2)p} \frac{\tot M^2}{\sqrt{\epsilon}} +\overline{L} \alpha^2 M^2 \phi_M^2 \frac{(1-\beta_2)p}{T\epsilon} \nonumber\\
   & \leq   \sqrt{\frac{M^2 p}{n}}  \frac{ \EE[f(\bar{\theta}_1)]  - \min \limits_{\theta \in \Theta} f(\theta)}{\tot \alpha R} +      \frac{\phi_M   \sigma^2}{R n} \sqrt{\frac{1 - \beta_2}{M^2 p}  } \\
   &+4 \Big[ \frac{\alpha^2 \overline{L}}{\sqrt{\epsilon}}  M^2 T^2 G^2 (1-\beta_2)p + \frac{M^2 \alpha }{\sqrt{\epsilon}}+\phi_M^2 \alpha \sqrt{M^2+p\sigma^2} \\
   & + \sigma^2 \Big(\frac{\alpha^2 \overline{L}}{\sqrt{\epsilon}}  M^2 T^2(1-\beta_2)p+ \phi_M \frac{\tot }{\sqrt{n}} \Big)\Big]+\frac{\alpha \beta_1}{1-\beta_1}  \sqrt{(1-\beta_2)p} \frac{\tot M^2}{\sqrt{\epsilon}} \\
   &\hspace{1.4in} +\overline{L} \alpha^2 M^2 \phi_M^2 \frac{(1-\beta_2)p}{T\epsilon} +\frac{2\alpha^3 \overline{L} p\phi_M^2}{ \sqrt{\epsilon}}\frac{\beta_1^2}{(1-\beta_1)^2}.
\end{align*}
Organizing terms, we conclude the proof.
\end{proof}

\bibliographystyle{plainnat}
\bibliography{karimi_320}

\end{document}
