% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
%\usepackage{xr} 
%\externaldocument{xiao_524}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\usepackage[ruled,vlined]{algorithm2e} %linesnumbered
\usepackage{amssymb}
\usepackage{amsmath, amsthm, amsfonts, mathrsfs}
\usepackage{bbm}
\usepackage{bm}
\usepackage{color, colortbl}
\definecolor{LightCyan}{rgb}{0.88,1,1}
\usepackage{dirtytalk}
\usepackage{dsfont}
\usepackage{enumerate}
\usepackage{fullpage}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{mathtools}
\usepackage{subfigure}
\usepackage{enumitem}
\usepackage{url}
\usepackage{xspace}
\usepackage{booktabs}
\usepackage{pifont}
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\usepackage{makecell}
\usepackage{multirow, array}

\usepackage[capitalize,noabbrev]{cleveref}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}[section]
\newtheorem{proposition}{Proposition}
% \newtheorem{lemma}[theorem]{Lemma}
\newtheorem{assumption}{Assumption}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
\newenvironment{hproof}{%
  \renewcommand{\proofname}{Proof Sketch}\proof}{\endproof}

\theoremstyle{remark}
\newtheorem*{remark}{Remark}


\newcommand{\cA}{\mathcal{A}}
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cE}{\mathcal{E}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\cG}{\mathcal{G}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cJ}{\mathcal{J}}
\newcommand{\cK}{\mathcal{K}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cM}{\mathcal{M}}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\cO}{\mathcal{O}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cR}{\mathcal{R}}
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\cU}{\mathcal{U}}
\newcommand{\cV}{\mathcal{V}}
\newcommand{\cW}{\mathcal{W}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cZ}{\mathcal{Z}}

\newcommand{\mA}{\mathbf{A}}
\newcommand{\mB}{\mathbf{B}}
\newcommand{\mG}{\mathbf{G}}
\newcommand{\mI}{\mathbf{I}}
\newcommand{\mJ}{\mathbf{J}}
\newcommand{\mV}{\mathbf{V}}
\newcommand{\mW}{\mathbf{W}}
\newcommand{\mX}{\mathbf{X}}
\newcommand{\mY}{\mathbf{Y}}
\newcommand{\mZ}{\mathbf{Z}}
\newcommand{\mU}{\mathbf{U}}
\newcommand{\mF}{\mathbf{F}}

\def\setF{\mathscr{F}} % Set F

\newcommand{\integerset}{\mathbb{Z}}
\newcommand{\naturalset}{\mathbb{N}}
\newcommand{\realset}{\mathbb{R}}

\newcommand{\diag}[1]{\mathrm{diag}\left(#1\right)}
\newcommand{\domain}[1]{\mathrm{dom}\left(#1\right)}
\newcommand{\range}[1]{\mathrm{rng}\left[#1\right]}

\newcommand{\E}{\mathbb{E}}
\newcommand{\Et}[1]{\mathbb{E}_t \left[#1\right]}
\newcommand{\prob}[1]{\mathbb{P} \left(#1\right)}
\newcommand{\condprob}[2]{\mathbb{P} \left(#1 \,\middle|\, #2\right)}
\newcommand{\probt}[1]{\mathbb{P}_t \left(#1\right)}
\newcommand{\var}[1]{\mathrm{var} \left[#1\right]}
\newcommand{\condvar}[2]{\mathrm{var} \left[#1 \,\middle|\, #2\right]}
\newcommand{\std}[1]{\mathrm{std} \left[#1\right]}
\newcommand{\condstd}[2]{\mathrm{std} \left[#1 \,\middle|\, #2\right]}
\newcommand{\cov}[1]{\mathrm{cov} \left[#1\right]}
\newcommand{\condcov}[2]{\mathrm{cov} \left[#1 \,\middle|\, #2\right]}

\newcommand{\abs}[1]{\left|#1\right|}
\newcommand{\ceils}[1]{\left\lceil#1\right\rceil}
\newcommand*\dif{\mathop{}\!\mathrm{d}}
\newcommand{\floors}[1]{\left\lfloor#1\right\rfloor}
\newcommand{\I}[1]{\mathds{1} \! \left\{#1\right\}}
\newcommand{\maxnorm}[1]{\|#1\|_\infty}
\newcommand{\negpart}[1]{\left[#1\right]^-}
\newcommand{\norm}[1]{\left\|#1\right\|}
\newcommand{\normw}[2]{\|#1\|_{#2}}
\newcommand{\pospart}[1]{\left[#1\right]^+}
\newcommand{\set}[1]{\left\{#1\right\}}
\newcommand{\subreal}[0]{\preceq}
\newcommand{\supreal}[0]{\succeq}
\newcommand{\T}{^\top}
\newcommand{\avein}{\frac{1}{n}\sum_{i=1}^n}
\newcommand{\avejn}{\frac{1}{n}\sum_{j=1}^n}
\newcommand{\bfone}{\mathbf{1}}
\newcommand{\bfonet}{\mathbf{1}^{\top}}
\newcommand{\red}[1]{\textcolor{red}{#1}}
\newcommand{\blue}[1]{\textcolor{blue}{#1}}

\DeclareMathOperator*{\argmax}{arg\,max\,}
\DeclareMathOperator*{\argmin}{arg\,min\,}
\let\det\relax
\DeclareMathOperator{\det}{det}
\DeclareMathOperator{\dom}{dom}
\DeclareMathOperator{\poly}{poly}
\DeclareMathOperator{\rank}{rank}
\DeclareMathOperator{\sgn}{sgn}
\DeclareMathOperator{\prox}{\mathbf{prox}}
\DeclareMathOperator{\proj}{\Pi}
\let\trace\relax
\DeclareMathOperator{\trace}{tr}
\def\<#1,#2>{\left\langle #1,#2\right\rangle}
\mathchardef\mhyphen="2D
\allowdisplaybreaks

\title{A One-Sample Decentralized Proximal Algorithm\\ for Non-Convex Stochastic Composite Optimization\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<texiao@ucdavis.edu>?Subject=Your UAI 2023 paper}
{Tesi Xiao}{}}
\author[2]{\href{mailto:<xuxchen@ucdavis.edu>?Subject=Your UAI 2023 paper}{Xuxing Chen}{}}
\author[1]{Krishnakumar Balasubramanian}
\author[3]{Saeed Ghadimi}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    University of California, Davis
    % Davis, California, USA
}
\affil[2]{%
    Department of Mathematics\\
    University of California, Davis
    % Davis, California, USA
}
\affil[3]{%
    Department of Management Sciences\\
    University of Waterloo
}
  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\textbf{Notations.} $\|\cdot\|$ denotes the $\ell_2$-norm for vectors and Frobenius norm for matrices. $\|\cdot\|_2$ denotes the spectral norm for matrices. $\mathbf{1}$ represents the all-one vector, and $\mI$ is the identity matrix as a standard practice. We identify vectors at agent $i$ in the subscript and use the superscript for the algorithm step. For example, the optimization variable of agent $i$ at step $k$ is denoted as $x^k_i$, and $z^k_i$ is the corresponding dual variable. We use uppercase bold letters to represent the matrix that collects all the variables from nodes (corresponding lowercase) as columns. We add an overbar to a letter to denote the average over all nodes. For example, we denote the optimization variables over all nodes at step $k$ as
\[\mX_k = \left[x_{1}^{k}, \dots, x_{n}^{k}\right].\]
The corresponding average over all nodes can thereby be defined as
\begin{align*}
\bar x^k &= \avein x_{i}^{k} = \frac{1}{n}\mX_k \bfone,\\
\bar \mX_k &= [\bar x^k, \dots, \bar x^k] = \bar x^k \bfonet = \frac{1}{n}\mX_k \bfone \bfonet.
\end{align*}
For an extended valued function $\Psi: \realset^d \rightarrow \realset \cup \{+\infty\}$, its effective domain is written as $\dom(\Psi) = \{x \mid \Psi(x)<+\infty\}$. A function $\Psi$ is said to be proper if $\dom(\Psi)$ is non-empty. For any proper closed convex function $\Psi$, $x\in\realset^d$, and scalar $\gamma>0$, the proximal operator is defined as
\[
\prox_{\Psi}^{\gamma}(x) = \argmin_{y\in \realset^d}\left\{\frac{1}{2\gamma}\|y - x\|^2  + \Psi(y)\right\} .
\]
All random objects are properly defined in a probability space $(\Omega, \setF, \mathbb{P})$. Given a sub-$\sigma$-algebra $\cH \subseteq \setF$ and a random vector $x$, we write $x \in \cH$ if $x$ is $\cH$-measurable. We use $\sigma(\cdot)$ to denote the $\sigma$-algebra generated by all the argument random vectors. Without loss of generality, we assume $n\geq 2$.

\begin{assumption}\label{aspt:gossipMatrix}
The adjacency matrix $\mW=(w_{ij})\in\realset^{n\times n}$ is symmetric and doubly stochastic, i.e.,
\begin{equation*}
    \mW = \mW^\top, \quad \mW \mathbf{1}_n = \mathbf{1}_n, \quad w_{ij}\geq 0, \forall i, j, 
\end{equation*}
and its eigenvalues satisfy $1=\lambda_1 > \lambda_2 \geq \dots \geq \lambda_n$ and $\rho\coloneqq \max\{|\lambda_2|, |\lambda_n|\}<1$.
\end{assumption}

\begin{assumption}\label{aspt:lipschitz-gradient}
All functions $\{F_{i}\}_{1\leq i\leq n}$ have Lipschitz continuous gradients with Lipschitz constants $L_{\nabla F_{i}}$, respectively. Therefore, $\nabla F$ is $L_{\nabla F}$-Lipschitz continuous with $L_{\nabla F} ={\max}_{1\leq i\leq n} \{L_{\nabla F_i}\}$.
\end{assumption}

\begin{assumption}\label{aspt:Psi}
The function $\Psi: \realset^d \rightarrow \realset\cup\{+\infty\}$ is a closed proper convex function.
\end{assumption}

For stochastic oracles, we assume that each node $i$ at every iteration $k$ is able to obtain a local random data vector $\xi^{k}_i$. The induced natural filtration is given by $\setF_0 = \{\emptyset, \Omega\}$ and 
\[
    \setF_k \coloneqq \sigma\left(\xi^{t}_i \mid i =1 ,\dots, n, \, t=1,\dots, k \right), \forall k\geq 1.
\]
We require that the stochastic gradient $\nabla G_i(\cdot, \xi^{k+1}_i)$ is unbiased conditioned on the filtration $\setF_k$.
\begin{assumption}[Unbiasedness]\label{aspt: Unbiasness} For any $k\geq 0, x\in \setF_k$, and $1\leq i\leq n$, $\E\left[\nabla G_i(x, \xi^{k+1}_i)\mid \setF_k\right] = \nabla F_{i}(x).$
\end{assumption}
\begin{assumption}[Independence]\label{aspt: independence} For any $k\geq 0, 1\leq i, j\leq n, i\neq j,\ \xi_i^{k+1}$ is independent of $\setF_k$, and $\xi_i^{k+1}$ is independent of $\xi_j^{k+1}$.
\end{assumption}
In addition, we consider two standard assumptions on the variance and heterogeneity of stochastic gradients.
\begin{assumption}[Bounded variance]\label{aspt: Bounded Variance} For any $k\geq 0, x\in \setF_k$, and $1\leq i\leq n$, \[\E\left[\norm{\nabla G_i(x, \xi^{k+1}_i) - \nabla F_{i}(x)}^2\middle\vert \setF_k\right] \leq \sigma^2_i.\]
\end{assumption}

\begin{assumption}[Gradient heterogeneity]\label{aspt: Gradient heterogeneity} There exists a constant $\nu\geq0$ such that for all $1 \leq
i \leq n, x\in \realset^d$, 
\[
 \norm{\nabla F_i(x) - \nabla F(x)} \leq \nu.
\]
\end{assumption}


\begin{algorithm}[t]
    \caption{\texttt{Prox-DASA}}\label{algo: Prox-DASA}
    \SetAlgoLined
    \KwIn{$x_i^0 = z_i^0 = \mathbf{0}, \gamma, \{\alpha_k\}_{k\geq 0}, m$}
    \For{$k=0, 1,\dots,K-1$}{
        \CommentSty{\# Local Update}\\
        \For{$i=1,2,\dots,n$ (in parallel)}{
            $y_i^k = \prox_{\Psi}^{\gamma}\left(x_{i}^{k} - \gamma z_{i}^{k}\right)$\\
            %$y_{i}^{k} = \argmin_{y\in \realset^d}\left\{\<z_{i}^{k}, y - x_{i}^{k}> + \frac{1}{2\gamma}\|y - x_{i}^{k}\|^2 + \Psi(y)\right\}$ \\
            $\tilde{x}_{i}^{k+1} = (1- \alpha_k)x_{i}^{k} + \alpha_ky_{i}^{k}$\\
            \CommentSty{\# Compute stochastic gradient}\\
            $v_{i}^{k+1} = \nabla G_{i}(x_{i}^{k}, \xi_{i}^{k+1})$\\
            % $\tilde u_i^{k+1} = u_i^k + v_i^{k+1} - v_i^k$ \\
            $\tilde{z}_{i}^{k+1} = (1 - \alpha_k)z_{i}^{k} + \alpha_k v_{i}^{k+1}$\\
        }
        \CommentSty{\# Communication}\\
        $[x_1^{k+1}, \dots, x_{n}^{k+1}] = [\tilde{x}_1^{k+1}, \dots, \tilde{x}_{n}^{k+1}]\mW^m$\\
        $[z_1^{k+1}, \dots, z_{n}^{k+1}] = [\tilde{z}_1^{k+1}, \dots, \tilde{z}_{n}^{k+1}]\mW^m$
    }
\end{algorithm}

\begin{algorithm}[t]
    \caption{\texttt{Prox-DASA-GT}}\label{algo: Prox-DASA-GT}
    \SetAlgoLined
    \KwIn{$x_i^0 = z_i^0 = \mathbf{0}, \gamma, \{\alpha_k\}_{k\geq 0}, m$}
    \For{$k=0, 1,\dots,K$}{
        \CommentSty{\# Local Update}\\
        \For{$i=1,2,\dots,n$ (in parallel)}{
            $y_{i}^{k} = \prox_{\Psi}^{\gamma}\left(x_{i}^{k} - \gamma z_{i}^{k}\right)$\\
            $\tilde{x}_{i}^{k+1} = (1- \alpha_k)x_{i}^{k} + \alpha_ky_{i}^{k}$\\
            \CommentSty{\# Compute stochastic gradient}\\
            $v_{i}^{k+1} = \nabla G_{i}(x_{i}^{k}, \xi_{i}^{k+1})$\\
            $\tilde u_i^{k+1} = u_i^k + v_i^{k+1} - v_i^k$ \\
            $\tilde{z}_{i}^{k+1} = (1 - \alpha_k)z_{i}^{k} + \alpha_k u_{i}^{k}$\\
        }
        \CommentSty{\# Communication}\\
        $[x_1^{k+1}, \dots, x_{n}^{k+1}] = [\tilde{x}_1^{k+1}, \dots, \tilde{x}_{n}^{k+1}]\mW^m$\\
        $[u_1^{k+1}, \dots, u_{n}^{k+1}] = [\tilde{u}_1^{k+1}, \dots, \tilde{u}_{n}^{k+1}]\mW^m$\\
        $[z_1^{k+1}, \dots, z_{n}^{k+1}] = [\tilde{z}_1^{k+1}, \dots, \tilde{z}_{n}^{k+1}]\mW^m$
    }
\end{algorithm}


\section{Experimental Details}
\label{sec: exp_details}

All experiments in Section 5.2 are conducted on a laptop with an Intel Core i7-11370H processor and the Windows 11 operating system. The total iteration numbers for a9a and MNIST are 10000 and 3000, respectively. The graph that represents the network topology is set to be a ring (or cycle in graph theory) for a9a and a random graph (given by \cite{mancino2022proximal}) for MNIST (see Figure~\ref{fig: graphs}). %To demonstrate the performance of our algorithms in a constant batch size setting, the batch sizes are chosen to be $4$ for a9a and $32$ for MNIST in all algorithms. 
% We adjust the learning rates provided in the code of \cite{mancino2022proximal} accordingly and select the ones that have the best performance. 
% For \texttt{Prox-DASA} and \texttt{Prox-DASA-GT} we choose a diminishing stepsize sequence, namely, $\alpha_k =  \min\left\{\alpha\sqrt{\frac{n}{k}}, 1\right\}$ for all $k\geq 0$. 
% Note that the same complexity (up to logarithmic factors) bounds can be obtained by directly plugging in the aforementioned expressions for $\alpha_k$ in Section 4.3. %Then we tune $\gamma\in \left\{1, 3, 10\right\}$ and $\alpha\in \left\{0.3, 1.0, 3.0\right\}$. The penalty parameter $\lambda$ is chosen to be $0.0001$ for all experiments.

We summarize the outputs of all experiments in Table~\ref{table: output}, from which we can tell that \texttt{Prox-DASA} and \texttt{Prox-DASA-GT} achieve good performance in a relatively short amount of time. The stationarity is defined as $\|\cG(\bar x^k, \nabla F(\bar x^k), 1)\|^2 + \|\mX_k - \bar \mX_k\|^2$, which is the same as that in \cite{mancino2022proximal}. As mentioned in the caption of Figure 2 in the main paper, there is an extra hyperparameter $q$ in \texttt{ProxGT-SR-E}. We found that a large $q$ already works well for the a9a experiment, but $q$ has to be small in the MNIST experiment; otherwise, the final accuracy will be much lower than the one presented in Table~\ref{table: output}. Hence, in \texttt{ProxGT-SR-E} we choose $q=1000$ for a9a and $q=32$ for MNIST, and the plots that take this amount of epochs into account are in Figure~\ref{fig: full_appendix}.



\begin{table*}[h]
\centering
\caption{Comparisons between all algorithms}\label{table: output}
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
\textbf{Algorithm} & \textbf{Accuracy} & \textbf{Training Loss} & \textbf{Stationarity} & \makecell{\bf Communication \\ \bf time per iteration (s)} & \makecell{\bf Computation \\ \bf time per iteration (s)} & \makecell{\bf Total time \\ \bf per iteration (s)} \\ \hline
\multicolumn{7}{|c|}{\bf a9a} \\ \hline
\texttt{SPPDM}              & 84.64\%             & 0.3340            & 0.0174                & 0.0260                      & 0.0305                    & 0.0565              \\ \hline
\texttt{ProxGT-SR-E}        & 76.38\%             & 0.6528             & 0.0797                & 0.0521                      & 0.0394                    & 0.0915              \\ \hline
\texttt{DEEPSTORM v2}          & \bf 84.90\%             & \bf 0.3274             & 0.0029                & 0.0525                      & 0.0398                    & 0.0923              \\ \hline
\texttt{Prox-DASA}          & 84.71\%             & 0.3338             & \bf 0.0017                & 0.0360                      & 0.0298                    & 0.0658              \\ \hline
\texttt{Prox-DASA-GT}       & 84.69\%             & 0.3342             &  \bf 0.0017                & 0.0390                      & 0.0301                    & 0.0691              \\ \hline
\multicolumn{7}{|c|}{\bf MNIST} \\ \hline
\texttt{SPPDM}              & 76.54\%           & 0.7854                 & 0.0436                & 0.1587                      & 0.1246                    & 0.2833              \\ \hline
\texttt{ProxGT-SR-E}        & 92.26\%           & 0.3042                 & 0.0250                & 0.1771                      & 0.3368                    & 0.5139              \\ \hline
\texttt{DEEPSTORM v2}          & 94.52\%           & 0.1759                 &  \bf 0.0016                & 0.1758                      & 0.2030                    & 0.3788              \\ \hline
\texttt{Prox-DASA}          & 96.74\%           & 0.1469                 & 0.0081                & 0.1912                      & 0.1299                    & 0.3211              \\ \hline
\texttt{Prox-DASA-GT}       & \bf 96.84\%           & \bf 0.1460                 & 0.0058                & 0.1935                      & 0.1317                    & 0.3252              \\ \hline
\end{tabular}
}
\end{table*}

\begin{figure*}[h]
    \centering
    \begin{tikzpicture}[shorten >=1pt,->]
      \tikzstyle{vertex}=[circle,fill=black!25,minimum size=6pt,inner sep=2pt]
      \node[vertex] (G_1) at (0, 1) {};
      \node[vertex] (G_2) at (0.707, 0.707)   {};
      \node[vertex] (G_3) at (1, 0)  {};
      \node[vertex] (G_4) at (0.707, -0.707)  {};
      \node[vertex] (G_5) at (0, -1)  {};
      \node[vertex] (G_6) at (-0.707, -0.707)  {};
      \node[vertex] (G_7) at (-1, 0)  {};
      \node[vertex] (G_8) at (-0.707, 0.707)  {};
      \draw (G_1) -- (G_2) -- (G_3) -- (G_4) -- (G_5) -- (G_6) -- (G_7) -- (G_8) -- (G_1) -- cycle;
      \node[vertex] (v1) at (3, 1) {};
      \node[vertex] (v2) at (3.707, 0.707)   {};
      \node[vertex] (v3) at (4, 0)  {};
      \node[vertex] (v4) at (3.707, -0.707)  {};
      \node[vertex] (v5) at (3, -1)  {};
      \node[vertex] (v6) at (3-0.707, -0.707)  {};
      \node[vertex] (v7) at (3-1, 0)  {};
      \node[vertex] (v8) at (3-0.707, 0.707)  {};
      \draw (v1) -- (v2) -- (v3) -- (v6) -- (v4) -- (v5) -- (v7) -- (v8) -- (v1) --cycle;
      \draw (v1) -- (v3) -- (v7) -- (v4) -- (v8) -- (v2) -- (v5) -- (v6) -- (v1) -- cycle;
      \draw (v1) -- (v4) -- (v2) -- (v6) -- (v7) -- (v1) --cycle;
      \draw (v1) -- (v5) -- (v8) -- cycle;
      \draw (v3) -- (v8) -- cycle;
    \end{tikzpicture}
    \caption{Network topology. The left represents the ring topology and the right represents the random graph.}
    \label{fig: graphs}
\end{figure*}

% \begin{table*}[t]
% \centering
% \caption{Results of MNIST}
% \begin{tabular}{|c|c|c|c|c|c|c|}
% \hline
% \textbf{Algorithm} & \textbf{Accuracy} & \textbf{Training Loss} & \textbf{Stationarity} & \makecell{\bf Communication \\ \bf time per iteration} & \makecell{\bf Computation \\ \bf time per iteration} & \makecell{\bf Total time \\ \bf per iteration} \\ \hline
% \texttt{SPPDM}              & 76.54\%           & 0.7854                 & 0.0436                & 0.1587                      & 0.1246                    & 0.2833              \\ \hline
% \texttt{ProxGT-SR-E}        & 92.26\%           & 0.3042                 & 0.0250                & 0.1771                      & 0.3368                    & 0.5139              \\ \hline
% \texttt{DEEPSTORM v2}          & 94.52\%           & 0.1759                 & 0.0016                & 0.1758                      & 0.2030                    & 0.3788              \\ \hline
% \texttt{Prox-DASA}          & 96.74\%           & 0.1469                 & 0.0081                & 0.1912                      & 0.1299                    & 0.3211              \\ \hline
% \texttt{Prox-DASA-GT}       & 96.84\%           & 0.1460                 & 0.0058                & 0.1935                      & 0.1317                    & 0.3252              \\ \hline
% \end{tabular}
% \end{table*}


\begin{figure*}
    \centering
    \subfigure[]{\label{a9a_acc_appendix}\includegraphics[width=0.32\textwidth]{Figures/a9a_acc_epo_full.pdf}}
    \subfigure[]{\label{a9a_loss_appendix}\includegraphics[width=0.32\textwidth]{Figures/a9a_train_loss_epo_full.pdf}}
    \subfigure[]{\label{a9a_stat_appendix}\includegraphics[width=0.32\textwidth]{Figures/a9a_stat_epo_full.pdf}}

    
    \subfigure[]{\label{mnist_acc_appendix}\includegraphics[width=0.32\textwidth]{Figures/mnist_acc_epo_full.pdf}}
    \subfigure[]{\label{mnist_loss_appendix}\includegraphics[width=0.32\textwidth]{Figures/mnist_train_loss_epo_full.pdf}}
    \subfigure[]{\label{mnist_stat_appendix}\includegraphics[width=0.32\textwidth]{Figures/mnist_stat_epo_full.pdf}}
    \caption{Comparisons between \texttt{SPPDM} \citep{wang2021distributed}, \texttt{ProxGT-SR-E} \citep{xin2021stochastic}, \texttt{DEEPSTORM} \citep{mancino2022proximal}, \texttt{Prox-DASA} (Algorithm~\ref{algo: Prox-DASA}), and \texttt{Prox-DASA-GT} (Algorithm~\ref{algo: Prox-DASA-GT}). In each experiment, \texttt{ProxGT-SR-E} computes 1 more epoch than the other algorithms every $q$ iterations. The parameter $q$ is chosen to be 1000 for a9a and 32 for MNIST.}\label{fig: full_appendix}
\end{figure*}




\section{Accelerated Consensus}
When the number of communication rounds $m>1$, we can replace $\mW^m$ with the Chebyshev mixing protocol described in Algorithm~\ref{algo: acc-consensus}.

\begin{algorithm}[ht]
    \caption{Chebyshev Mixing Protocol}\label{algo: acc-consensus}
    \SetAlgoLined
    \KwIn{Matrix $\mX$, mixing matrix $\mW$, rounds $m$}
    Set $\mA_0= \mX, \mA_1 = \mX\mW, \rho=\max\{|\lambda_2(\mW)|, |\lambda_n(\mW)|\}<1, \mu_0 = 1, \mu_1 = \frac{1}{\rho}$\\
    \For{$t=1,\dots,m-1$}{
        $\mu_{t+1} = \frac{2}{\rho}\mu_t - \mu_{t-1}$\\
        $\mA_{t+1} = \frac{2\mu_t}{\rho\mu_{t+1}}\mA_t \mW - \frac{\mu_{t-1}}{\mu_{t+1}} \mA_{t-1}$
    }
    \KwOut{$\mA_m$}
\end{algorithm}

Then, we have the following lemma.
\begin{lemma}\label{lem: chebyshev_mixing}
    Suppose $\mW$ satisfies Assumption~\ref{aspt:gossipMatrix}. Let $\mA_0$ and $\mA_m$ be the input and output matrices of Algorithm~\ref{algo: acc-consensus}, respectively. Then, we have
    \[
        \norm{\mA_m - \bar \mA_m} \leq 2\left( 1-\sqrt{1-\rho} \right)^m\norm{\mA_0 - \bar \mA_0}.
    \]
\end{lemma}
Hence, we obtain a linear convergence rate of $\left( 1-\sqrt{1-\rho} \right)$ instead of $\rho$. By virtue of that, we can set $m=\lceil \frac{1}{\sqrt{1-\rho}} \rceil$ to obtain a topology-independent iteration complexity.

\section{Convergence Analysis}

We present the complete proof in this section. In the sequel, $\|\cdot\|$ denotes the $\ell_2$-norm for vectors and Frobenius norm for matrices. $\|\cdot\|_2$ denotes the spectral norm for matrices. $\mathbf{1}$ represents the all-one vector. We identify vectors at agent $i$ in the subscript and use the superscript for the algorithm step. For example, the optimization variable of agent $i$ at step $k$ is denoted as $x^k_i$, and $z^k_i$ is the corresponding dual variable. We use uppercase bold letters to represent the matrix that collects all the variables from agents (corresponding lowercase) as columns. To be specific,
\[
\mX_k = \left[x_{1}^{k}, \dots, x_{n}^{k}\right], \quad \mZ_k = \left[z_{1}^{k}, \dots, z_{n}^{k}\right], \quad  \mY_k = \left[y_{1}^{k},\dots, y_{n}^{k}\right], \quad \mV_{k+1} = \left[ v^{k+1}_1, \dots, v^{k+1}_n \right].
\]
We add an overbar to a letter to denote the average over all agents. For example, 
\[
\bar x^k = \avein x_{i}^{k} = \frac{1}{n}\mX_k \bfone, \quad \bar \mX_k = [\bar x^k, \dots, \bar x^k] = \bar x^k \bfonet = \frac{1}{n}\mX_k \bfone \bfonet.
\]
Hence, the consensus errors for iterates $\{x^k_i\}$ and dual variables $\{z^k_i\}$ can be written as 
\[
\avein \norm{x^k_i - \bar x^k}^2  = \frac{1}{n}\norm{\mX_k - \bar \mX_k}^2,\quad \avein \norm{z^k_i - \bar z^k}^2  = \frac{1}{n}\norm{\mZ_k - \bar \mZ_k}^2.
\]
We denote $L_{\nabla F} = \underset{1\leq i\leq n}{\max} \{L_{\nabla F_i}\}$ for ease of presentation. Our proof heavily relies on the merit function below:
\begin{equation}\label{def: merit_fun}
    W(\bar x^k,\bar z^k) = \underbrace{\Phi(\bar x^{k}) - \Phi_*}_{\text{function value gap}} + \underbrace{\Psi(\bar x^k) - \eta(\bar x^{k}, \bar z^{k})}_{\text{primal convergence}} + \lambda \underbrace{\norm{\nabla F(\bar x^k) - \bar z^k}^2}_{\text{dual convergence}},
\end{equation}
where
\begin{equation}\label{def: eta}
    \eta(x, z) = \min_{y\in \realset^d}\left\{\<z,y-x> + \frac{1}{2\gamma}\|y-x\|^2 + \Psi(y)\right\}.
\end{equation}


\subsection{Technical Lemmas}

\begin{lemma}\label{lem: F_norm_ineq}
    For any $p, q, r\in \naturalset_+$ and matrices $\mA\in \realset^{p\times q}, \mB\in \realset^{q\times r}$, we have:
    \[
        \|\mA\mB\| \leq \min\left(\|\mA\|_2\cdot \|\mB\|, \|\mA\|\cdot \|\mB\T\|_2\right).
    \]
\end{lemma}

\begin{lemma}\label{lem: W_m}
    Suppose $\mW$ satisfies Assumption \ref{aspt:gossipMatrix}. For any $m\in \naturalset_+$, we have
    \[
        \norm{\mW^m - \frac{\bfone_n\bfonet_n}{n}}_2\leq \rho^m.
    \]
\end{lemma}


\begin{lemma}\label{lem: cons_decay}
    Suppose we are given three sequences $\{a_n\}_{n=0}^{\infty}, \{b_n\}_{n=0}^{\infty},\ \{\tau_n\}_{n=-1}^{\infty},$ and a constant $r<1$ satisfying
    \begin{equation}\label{ineq: conditions_cons_decay}
        a_{k+1}\leq r a_k + b_{k},\ a_k\geq 0,\ b_k\geq 0,\  0 = \tau_{-1}\leq \tau_{k+1}\leq \tau_k\leq 1,
    \end{equation}
    for all $k\geq 0$. Then for any $K > 0$, we have
    \[
        \sum_{k=0}^{K}\tau_ka_k\leq \frac{1}{1-r}\left(\tau_0a_0 + \sum_{k=0}^{K}\tau_kb_k\right).
    \]
\end{lemma}
\begin{proof}
    Note that we have
    \begin{align*}
        (1-r)\sum_{k=0}^{K}\tau_ka_k \leq \sum_{k=0}^{K}\tau_k(a_k - a_{k+1} + b_k) = \sum_{k=0}^{K}(\tau_k - \tau_{k-1})a_k - \tau_Ka_{K+1} + \sum_{k=0}^{K}\tau_kb_k\leq \tau_0a_0 + \sum_{k=0}^{K}\tau_kb_k,
    \end{align*}
    where the inequalities use \eqref{ineq: conditions_cons_decay}, and the equality uses summation by parts.
\end{proof}


\begin{lemma}\label{lem:eta-smooth}
Let $\Psi: \realset^d \rightarrow \realset\cup\{+\infty\}$ be a closed proper convex function.
\begin{itemize}
\item[(a)] Let $\eta(x,z)$ be the function defined in \eqref{def: eta}. Then, $\nabla \eta$ is $C_\gamma$-Lipschitz continuous where
\begin{equation}
    C_{\gamma} = 2 \sqrt{\left(1+\frac{1}{\gamma}\right)^2 + \left(1 + \frac{\gamma}{2}\right)^2}.
\end{equation}
\item[(b)] For $x, z \in \realset^d$ and $\gamma>0$, let $y_+ = \prox_{\Psi}^{\gamma}(x-\gamma z) = \underset{y\in \realset^d} {\argmin} \left\{\<z,y-x> + \frac{1}{2\gamma}\|y-x\|^2 + \Psi(y)\right\}$, then for any $y\in \realset^d$, we have
\begin{equation*}
    \Psi(y_+) - \Psi(y) \leq \< z + \gamma^{-1}(y_+ - x), y-y_+ >.
\end{equation*}
\end{itemize}
\end{lemma}
\begin{proof}
We first prove (a). Recall that the Moreau envelope of a convex and closed function $\Psi$ multiplied by a scalar $\gamma$ is defined by
\begin{equation*}
    \text{env}_{\gamma \Psi}(x) = \underset{y\in\realset^d}{\min}\left\{\frac{1}{2\gamma}\norm{y-x}^2 + \Psi(y)\right\},
\end{equation*}
and its gradient is given by $\nabla \text{env}_{\gamma\Psi}(x) = \frac{1}{\gamma} (x - \prox_{\Psi}^{\gamma}(x))$ where $\prox_{\Psi}^{\gamma}(x)=\underset{y\in\realset^d}{\argmin}\left\{\frac{1}{2\gamma}\norm{y-x}^2 + \Psi(y)\right\}$. Note that $\eta(x, z) = \text{env}_{\gamma \Psi}\left(x-\gamma z\right) - \frac{\gamma}{2}\norm{z}^2$. Therefore, the partial gradients of $\eta$ are given by
\begin{equation}
    \nabla_x \eta (x, z) = -z - \gamma^{-1} \left(\prox_{\Psi}^{\gamma}\left(x-  \gamma z\right) - x\right), \quad \nabla_z \eta (x, z) =  \prox_{\Psi}^{\gamma}\left(x-  \gamma z\right) - x.
\end{equation}
Hence, for any $(x, z)$ and $(x', z')$,
\begin{align}
    \norm{\nabla \eta (x, z)  -\nabla \eta (x', z') } &\leq \norm{\nabla_x \eta (x, z) - \nabla_x \eta (x', z')} + \norm{\nabla_z \eta (x, z) - \nabla_z \eta (x', z')}\notag\\
    &\leq 2(1+1/\gamma) \norm{x - x'} + (2+\gamma) \norm{z- z'} \leq C_{\gamma} \norm{(x, z) - (x', z')}.\notag
\end{align}
To prove (b), denote the subdifferential of $\Psi(x)$ as $\partial \Psi(x)$. By the optimality condition, $\mathbf{0}$ is a subgradient of $H(y) = \<z,y-x> + \frac{1}{2\gamma}\|y-x\|^2 + \Psi(y)$ at $y_+$, i.e.,
\begin{equation*}
    \mathbf{0} \in z + \gamma^{-1}(y_+ - x) + \partial \Psi(y_+).
\end{equation*}
Hence, there exists a subgradient of $\Psi(y)$ at $y_+$, denoted by $\Tilde{\nabla}\Psi(y_+)$, such that
\begin{equation*}
    \Tilde{\nabla}\Psi(y_+) = - z - \gamma^{-1}(y_+ - x).
\end{equation*}
Finally, by the convexity of $\Psi$, we have for any $y\in\realset^d$,
\begin{equation*}
    \Psi(y) - \Psi(y_+) \geq \< \Tilde{\nabla} \Psi(y_+) , y-y_+> = \< - z - \gamma^{-1}(y_+ - x), y - y_+>,
\end{equation*}
which completes the proof.
\end{proof}




\subsection{Building Blocks of Main Proof}

The following lemma connects the consensus error of $\mY$ to the consensus errors of $\mX$ and $\mZ$.
\begin{lemma}\label{lem: consensus_y}
    Let $y^k_+ = \prox_{\Psi}^{\gamma}(\bar x^k - \gamma \bar z^k)$. Then for any $k\geq 0$ and $\gamma>0$, we have
    \begin{equation*}
        \norm{y^k_+ - \bar y^k}^2 + \frac{1}{n} \norm{\mY_k - \bar \mY_k}^2 = \frac{1}{n} \sum_{i=1}^{n} \norm{y_i^k - y^k_+}^2 \leq \frac{2}{n} \left\{\|\mX_{k} - \bar \mX_k\|^2 + \gamma^2\|\mZ_{k} - \bar \mZ_k\|^2 \right\}.
    \end{equation*}
\end{lemma}

\begin{proof}
    By the non-expansiveness of proximal operator, we have
    \begin{equation}\label{ineq: yi_y_plus_v2}
        \|y_{i}^{k} - y^k_+\| \leq \|x_{i}^{k} - \bar x^k - \gamma \left(z_{i}^{k} - \bar z^k\right)\|\leq \|x_{i}^{k} - \bar x^k\| + \gamma\|z_{i}^{k} - \bar z^k\|.
    \end{equation}
    Hence, the consensus error of $y$ can be bounded as
    \begin{align}
        &\frac{1}{n}\|\mY_k - \bar \mY_k\|^2 =\avein \|y_{i}^{k} - \bar y^k\|^2 = \avein \|y_{i}^{k} - y^k_+ + \avejn(y^k_+ - y_{j}^{k})\|^2 \notag\\
        =&\avein\|y_{i}^{k} - y^k_+\|^2 - \|\avejn\left(y_j^{k} - y_+^{k}\right)\|^2 \leq \avein\|y_{i}^{k} - y^k_+\|^2\notag\\
        \leq &\frac{2}{n} \left\{\|\mX_{k} - \bar \mX_k\|^2 + \gamma^2\|\mZ_{k} - \bar \mZ_k\|^2 \right\}, \label{ineq: y_consensus}
    \end{align}
    where the third equality uses the fact that
    \[
        \avein\left\|v_i - \left(\avejn v_j\right)\right\|^2 = \avein\left\|v_i\right\|^2 - \left\|\avejn v_j\right\|^2
    \]
    for any vectors $v_i\ (1\leq i\leq n)$.
\end{proof}

The following technical lemma explicitly characterizes the consensus error.

\begin{lemma}[Consensus error of Algorithm~\ref{algo: Prox-DASA}: \texttt{Prox-DASA}]\label{lem: consensus}
Suppose Assumptions \ref{aspt:gossipMatrix}, \ref{aspt: Unbiasness}, \ref{aspt: independence}, \ref{aspt: Bounded Variance}, and \ref{aspt: Gradient heterogeneity} hold.  Let $\varrho(m) = \frac{(1+\rho^{2m}) \rho^{2m}}{(1-\rho^{2m})^2}$, and $\rho, m$ and $\alpha_k$ satisfy
    \begin{equation}\label{ineq: alpha_m_condition}
       \varrho(m)\alpha_k^2\leq \min\left\{\frac{1}{8}, \frac{1}{24L_{\nabla F}^2\gamma^2}\right\},\ 0 = \alpha_{-1}\leq\alpha_{k+1}\leq \alpha_k\leq 1
    \end{equation}
    for any $k\geq 0$.
    % \begin{equation}\label{ineq: alpha_m_condition}
    %     m\geq \max\left\{\frac{\log 11}{2(1-\rho)}, \frac{\log (2n + 3)}{2(1-\rho)}\right\}, 0 = \alpha_{-1}\leq\alpha_{k+1}\leq \alpha_k\leq \min\left\{1, \frac{\gamma^{-1}}{8L_{\nabla F}}\right\}
    % \end{equation}
    Then in Algorithm \ref{algo: Prox-DASA} for any $p\geq 0$, we have
    \begin{align*}
        \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right]\leq 4\gamma^2(\sigma^2 + 3L_{\nabla F}^2\nu^2)\varrho(m)\sum_{k=0}^{K}\alpha_k^{p+2},\notag \\
        \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right]\leq 4(\sigma^2 + 3L_{\nabla F}^2\nu^2)\varrho(m)\sum_{k=0}^{K}\alpha_k^{p+2}.
    \end{align*}
\end{lemma}
\begin{proof}
% From \eqref{ineq: alpha_m_condition} we know $m\geq \max\left\{\frac{\log 11}{2(1-\rho)}, \frac{\log (2n+3)}{2(1-\rho)}\right\}\geq \max\left\{\frac{\log 11}{2\log(\frac{1}{\rho})}, \frac{\log (2n+3)}{2\log(\frac{1}{\rho})}\right\}$ and hence
% \begin{equation}\label{ineq: m_rho}
%     \frac{(1+\rho^{2m})\rho^{2m}}{(1-\rho^{2m})^2}\leq\min\left\{\frac{1}{8},\frac{1}{2n}\right\}.
% \end{equation}
By Assumption \ref{aspt:gossipMatrix}, the iterates in Algorithm \ref{algo: Prox-DASA} satisfy
\begin{equation}\label{eq: xd2}
    \begin{aligned}
       &\mX_{k+1} = (1-\alpha_k)\mX_k \mW^{m} + \alpha_k\mY_k \mW^{m},\ \bar x^{k+1} = (1-\alpha_k)\bar x^k + \alpha_k \bar y^k, \\ 
       &\mZ_{k+1} = (1-\alpha_k)\mZ_k \mW^{m} + \alpha_k \mV_{k+1} \mW^{m},\ \bar z^{k+1} = (1-\alpha_k)\bar z^k + \alpha_k \bar v^{k+1}. 
    \end{aligned}
\end{equation}
Hence, for the consensus error of iterates $\{x^k_i\} $, we have
    \begin{align}
        &\norm{\mX_{k+1} - \bar \mX_{k+1}}^2 \notag \\
        = &\left\|\bigg((1-\alpha_k)\left(\mX_k - \bar \mX_k\right) + \alpha_k\left(\mY_k - \bar \mY_k\right)\bigg)\left(\mW^m - \frac{\bfone\bfonet}{n}\right)\right\|^2 \notag\\
        \leq &\left\{\left(1 + \frac{1-\rho^{2m}}{2\rho^{2m}}\right)(1-\alpha_k)^2\norm{\mX_k - \bar \mX_k}^2 + \left(1 + \frac{2\rho^{2m}}{1-\rho^{2m}}\right)\alpha_k^2\norm{\mY_k - \bar \mY_k}^2\right\} \rho^{2m} \notag \\
        \leq &\frac{(1+\rho^{2m})}{2}\norm{\mX_k - \bar \mX_k}^2 + \frac{(1+\rho^{2m})\rho^{2m}}{1-\rho^{2m}}\alpha_k^2\norm{\mY_k - \bar \mY_k}^2, \label{ineq: x_consensus}
    \end{align}
    where the first inequality uses Lemmas \ref{lem: F_norm_ineq} and \ref{lem: W_m}.  Combining \eqref{ineq: alpha_m_condition}, \eqref{ineq: x_consensus}, and Lemma \ref{lem: consensus_y}, we have
    \begin{align*}
        \E\left[\|\mX_{k+1} - \bar \mX_{k+1}\|^2\right]&\leq \frac{(1+\rho^{2m})}{2}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + \frac{(1-\rho^{2m})}{4}\E\left[\|\mX_{k} - \bar \mX_k\|^2 + \gamma^2\|\mZ_{k} - \bar \mZ_k\|^2\right] \\
        &= \frac{(3 + \rho^{2m})}{4}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + \frac{(1-\rho^{2m})\gamma^2}{4}\E\left[\|\mZ_{k} - \bar \mZ_k\|^2\right]
    \end{align*}
    Using Lemma \ref{lem: cons_decay} in the above inequality with $\tau_k = \frac{\alpha_k^p}{n}$ for any fixed $p\geq 0$ we know
    \begin{equation}\label{ineq: x_z_cons}
        \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right]\leq \sum_{k=0}^{K}\frac{\gamma^2\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right].
    \end{equation}
    Similarly to \eqref{ineq: x_consensus}, we can obtain the following results on the consensus error of dual variables $\{z^k_i\}$:
    \begin{equation}\label{ineq: z_consensus}
        \norm{\mZ_{k+1} - \bar \mZ_{k+1}}^2 \leq \frac{(1+\rho^{2m})}{2}\norm{\mZ_k - \bar \mZ_k}^2 + \frac{(1+\rho^{2m})\rho^{2m}}{1-\rho^{2m}}\alpha_k^2\norm{\mV_{k+1} - \bar \mV_{k+1}}^2.
    \end{equation}
    Using \eqref{ineq: alpha_m_condition} and Lemma \ref{lem: cons_decay} in \eqref{ineq: z_consensus} with $\tau_k = \frac{\alpha_k^p}{n}$, we have
    \begin{align}
        \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right]\leq 2\varrho(m)\sum_{k=0}^{K}\frac{\alpha_k^{p+2}}{n}\E\left[\|\mV_{k+1} - \bar \mV_{k+1}\|^2\right].\label{ineq: z_cons_v}
        % \leq \sum_{k=0}^{K}\frac{\alpha_k^{p}}{8nL_{\nabla F}^2\gamma^2}\E\left[\|\mV_{k+1} - \bar \mV_{k+1}\|^2\right]
        % &\leq \frac{2(1+\rho^{2m})\rho^{2m}}{(1-\rho^{2m})^2}\sum_{k=0}^{K} \left\{\frac{6L_{\nabla F}^2 \alpha_k^{p+2}}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + (\sigma^2 + 3L_{\nabla F}^2\nu^2)\sum_{k=0}^{K}\alpha_k^{p+2}\right\}\\
        % &\leq \frac{3(1+\rho^{2m})\rho^{2m}}{16(1-\rho^{2m})^2}\sum_{k=0}^{K} \frac{\alpha_k^{p}}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right] + \frac{2(1+\rho^{2m})\rho^{2m}}{(1-\rho^{2m})^2} (\sigma^2 + 3L_{\nabla F}^2\nu^2)\sum_{k=0}^{K}\alpha_k^{p+2}.
    \end{align}
    To bound $\|\mV_{k+1} - \bar \mV_{k+1}\|$ we first notice that
    \begin{align*}
        v_{i}^{k+1} - \bar v^{k+1}=&v_i^{k+1} - \E\left[v_i^{k+1}|\setF_k\right] - \avejn(v_{j}^{k+1} - \E\left[v_j^{k+1}|\setF_k\right]) \\
        +&\E\left[v_i^{k+1}|\setF_k\right] - \nabla F_i(\bar x^k) + \nabla F_i(\bar x^k) - \nabla F(\bar x^k) + \nabla F(\bar x^k) - \avejn\E\left[v_j^{k+1}|\setF_k\right] \\
        =&\left(1-\frac{1}{n}\right)(v_i^{k+1} - \E\left[v_i^{k+1}|\setF_k\right]) - \frac{1}{n}\sum_{j\neq i}(v_{j}^{k+1} - \E\left[v_j^{k+1}|\setF_k\right])\\
        + &\left(1-\frac{1}{n}\right)\left(\nabla F_i(x_i^{k}) - \nabla F_i(\bar x^k)\right) + \nabla F_i(\bar x^k) - \nabla F(\bar x^k) + \frac{1}{n}\sum_{j\neq i}\left(\nabla F_j(\bar x^k) - \nabla F_j(x_j^{k})\right)
    \end{align*}
    which gives
    \begin{align*}
        &\E\left[\|v_{i}^{k+1} - \bar v^{k+1}\|^2\right]\\
        = &\left(1-\frac{1}{n}\right)^2\E\left[\|v_i^{k+1} - \E\left[v_i^{k+1}|\setF_k\right]\|^2\right] + \frac{1}{n^2}\sum_{j\neq i}^{n}\E\left[\|v_{j}^{k+1} - \E\left[v_j^{k+1}|\setF_k\right]\|^2\right] \\
        + & \left\|\left(1-\frac{1}{n}\right)\left(\nabla F_i(x_i^{k}) - \nabla F_i(\bar x^k)\right) + \nabla F_i(\bar x^k) - \nabla F(\bar x^k) + \frac{1}{n}\sum_{j\neq i}\left(\nabla F_j(\bar x^k) - \nabla F_j(x_j^{k})\right)\right\|^2 \\
        \leq &\sigma^2 + 3L_{\nabla F}^2\left(\left(1-\frac{1}{n}\right)^2\|x_i^{k} - \bar x^{k}\|^2 + \nu^2 + \frac{1}{n}\sum_{j\neq i}\|x_j^k - \bar x^k\|^2\right),
    \end{align*}
    where the equality uses Assumption \ref{aspt: independence}, and the inequality uses the Cauchy-Schwarz inequality together with Assumptions \ref{aspt:lipschitz-gradient}, \ref{aspt: Bounded Variance}, and \ref{aspt: Gradient heterogeneity}. Hence we have
    \begin{equation}\label{ineq: V_cons}
        \E\left[\|\mV_{k+1} - \bar \mV_{k+1}\|^2\right] \leq 6L_{\nabla F}^2\E\left[\|\mX_k - \bar \mX_k\|^2\right] + n\sigma^2 + 3nL_{\nabla F}^2\nu^2.
    \end{equation}
    Combining \eqref{ineq: z_cons_v} and \eqref{ineq: V_cons}, we have
    \begin{equation}\label{ineq: z_x_cons}
        \begin{aligned}
            \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right] \leq &2\varrho(m)\sum_{k=0}^{K} \left\{\frac{6L_{\nabla F}^2 \alpha_k^{p+2}}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + (\sigma^2 + 3L_{\nabla F}^2\nu^2)\alpha_k^{p+2}\right\} \\
            \leq &\sum_{k=0}^{K}\left\{12 \varrho(m)\alpha_k^2 L_{\nabla F}^2 \gamma^2\right\}\frac{\alpha_k^{p}}{n\gamma^2}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + 2(\sigma^2 + 3L_{\nabla F}^2\nu^2)\varrho(m)\sum_{k=0}^{K}\alpha_k^{p+2} \\
            \leq &\sum_{k=0}^{K}\frac{\alpha_k^p}{2n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right] + 2(\sigma^2 + 3L_{\nabla F}^2\nu^2)\varrho(m)\sum_{k=0}^{K}\alpha_k^{p+2},
        \end{aligned}
    \end{equation}
    % \begin{equation}\label{ineq: z_x_cons}
    %     \begin{aligned}
    %         \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right]&\leq \sum_{k=0}^{K}\frac{6L_{\nabla F}^2\alpha_k^{p+2}}{n^2}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + \frac{\sigma^2 + 3L_{\nabla F}^2\nu^2}{n}\sum_{k=0}^{K}\alpha_k^{p+2} \\
    %         &\leq \sum_{k=0}^{K}\frac{\alpha_k^p}{2\gamma^2n}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + \frac{\sigma^2 + 3L_{\nabla F}^2\nu^2}{n}\sum_{k=0}^{K}\alpha_k^{p+2},
    %     \end{aligned}
    % \end{equation}
    where the last inequality uses \eqref{ineq: alpha_m_condition} and \eqref{ineq: x_z_cons}. By \eqref{ineq: x_z_cons} and \eqref{ineq: z_x_cons} we can finally obtain that
    \begin{align}
        \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right]\leq 4\gamma^2(\sigma^2 + 3L_{\nabla F}^2\nu^2)\varrho(m)\sum_{k=0}^{K}\alpha_k^{p+2},\label{ineq: x_cons_final} \\
        \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right]\leq 4(\sigma^2 + 3L_{\nabla F}^2\nu^2)\varrho(m)\sum_{k=0}^{K}\alpha_k^{p+2}.\label{ineq: z_cons_final}
    \end{align}
\end{proof}
% {\bf Remark:} Note that a sufficient condition to ensure \eqref{ineq: alpha_m_condition} is
% \[
%     \alpha_k \leq \min\left\{\frac{1}{4}, \frac{1}{7L_{\nabla F}\gamma}\right\}(\frac{1}{\rho^m} - \rho^m)
% \]

\begin{lemma}[Consensus error of Algorithm \ref{algo: Prox-DASA-GT}: \texttt{Prox-DASA-GT}]\label{lem: consensus_gt}
    Suppose Assumptions \ref{aspt:gossipMatrix}, \ref{aspt: Unbiasness}, \ref{aspt: Bounded Variance} and \ref{aspt: independence} hold. Let $\varrho(m) = \frac{(1+\rho^{2m}) \rho^{2m}}{(1-\rho^{2m})^2}$, and $\rho, m$ and $\alpha_k$ satisfy
    \begin{equation}\label{ineq: alpha_m_condition_gt}
        \varrho(m)\alpha_k^2\leq \frac{1}{8},\ \varrho(m)\alpha_k\leq \frac{1}{9L_{\nabla F}\gamma},\ 0 = \alpha_{-1}\leq\alpha_{k+1}\leq \alpha_k\leq 1
    \end{equation}
    % \begin{equation}\label{ineq: alpha_m_condition_gt}
    %     m\geq \max\left\{\frac{\log 11}{2(1-\rho)}, \frac{\log (2n + 3)}{2(1-\rho)}\right\}, 0 = \alpha_{-1}\leq\alpha_{k+1}\leq \alpha_k\leq \min\left\{1, \frac{\gamma^{-1}}{8L_{\nabla F}}\right\}
    % \end{equation}
    for any $k\geq 0$, and the initialization satisfies $u_i^0 = v_i^0= 0$ for all $i$. Then in Algorithm \ref{algo: Prox-DASA-GT} for any $p\geq 0$ we have
    \begin{align*}
        &\sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right]\leq 40 \gamma^2 \varrho(m)^2 \sum_{k=0}^{K}\alpha_k^{p+2}\left\{L_{\nabla F}^2 \alpha_k^2\E\left[\|\bar x^k - \bar y^k\|^2\right] +2\sigma^2\right\},\\
        &\sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right]\leq 40 \varrho(m)^2 \sum_{k=0}^{K}\alpha_k^{p+2}\left\{L_{\nabla F}^2 \alpha_k^2\E\left[\|\bar x^k - \bar y^k\|^2\right] +2\sigma^2\right\}.
    \end{align*}
\end{lemma}
\begin{proof}
    % By \eqref{ineq: alpha_m_condition_gt} we can obtain
    % \begin{equation}\label{ineq: m_rho_gt}
    %     \frac{(1+\rho^{2m})\rho^{2m}}{(1-\rho^{2m})^2}\leq\min\left\{\frac{1}{8},\frac{1}{2n}\right\}.
    % \end{equation}
    The updates in Algorithm \ref{algo: Prox-DASA-GT} take the form:
    \begin{equation}\label{eq: xd2_gt}
        \begin{aligned}
           &\mX_{k+1} = (1-\alpha_k)\mX_k \mW^{m} + \alpha_k\mY_k \mW^{m},\ \bar x^{k+1} = (1-\alpha_k)\bar x^k + \alpha_k \bar y^k, \\ 
           &\mU_{k+1} = \mU_k \mW^{m} + (\mV_{k+1} - \mV_k) \mW^{m},\ \bar u^{k+1} = \bar u^k + \bar v^{k+1} - \bar v^k, \\
           &\mZ_{k+1} = (1-\alpha_k)\mZ_k \mW^{m} + \alpha_k \mU_{k} \mW^{m},\ \bar z^{k+1} = (1-\alpha_k)\bar z^k + \alpha_k \bar u^{k}. 
        \end{aligned}
    \end{equation}
    Setting $u_i^0 = v_i^0$, we can prove by induction that $\bar u^k = \bar v^k$. To analyze the consensus error of $\mU_k$, we first notice:
    \begin{align*}
        &\mU_{k+1} - \bar \mU_{k+1} \\
        = & \left(\mU_k - \bar \mU_k + \mV_{k+1} - \mV_k - \bar \mV_{k+1} + \bar \mV_k\right)\left(\mW^m - \frac{\bfone\bfonet}{n}\right)\\
        = &\left(\mU_k - \bar \mU_k + \left(\mV_{k+1} - \mV_k\right)\left(\mI - \frac{\bfone\bfonet}{n}\right)\right)\left(\mW^m - \frac{\bfone\bfonet}{n}\right)
    \end{align*}
    which gives
    \begin{align*}
        &\|\mU_{k+1} - \bar \mU_{k+1}\|^2\\
        \leq &\left\{\left(1 + \frac{1-\rho^{2m}}{2\rho^{2m}}\right)\norm{\mU_k - \bar \mU_k}^2 + \left(1 + \frac{2\rho^{2m}}{1-\rho^{2m}}\right)\norm{\mV_{k+1} -  \mV_k}^2\right\} \rho^{2m} \\
        = &\frac{(1+\rho^{2m})}{2}\norm{\mU_k - \bar \mU_k}^2 + \frac{(1+\rho^{2m})\rho^{2m}}{1-\rho^{2m}}\norm{\mV_{k+1} -\mV_k}^2.
    \end{align*}
    Using Lemma \ref{lem: cons_decay}, we know for any $K\geq 0$ and $p\geq 0$,
    \begin{equation}\label{ineq: U_V_cons}
        \sum_{k=0}^{K}\alpha_k^p\|\mU_{k} - \bar \mU_{k}\|^2\leq2\varrho(m) \sum_{k=0}^{K}\alpha_k^p\norm{\mV_{k+1} -\mV_k}^2.
    \end{equation}
    Note that we also have
    \begin{align*}
        \mV_{k+1} - \mV_k = &\mV_{k+1} - \E\left[\mV_{k+1}|\setF_k\right] - \left(\mV_k - \E\left[\mV_k|\setF_{k-1}\right]\right) \\
        &+ \E\left[\mV_{k+1}|\setF_k\right] - \nabla \mF(\bar x^k) + \nabla \mF(\bar x^k) - \nabla \mF(\bar x^{k-1}) + \nabla \mF(\bar x^{k-1}) - \E\left[\mV_k|\setF_{k-1}\right]
    \end{align*}
    where we overload the notation and define $\nabla \mF(x) = [\nabla F_1(x), \dots, \nabla F_n(x)]$. Hence we know
    \begin{equation}\label{ineq: delta_V}
        \begin{aligned}
            &\E\left[\|\mV_{k+1} - \mV_k\|^2\right]\\
        \leq & 5\bigg\{\E\left[\|\mV_{k+1} - \E\left[\mV_{k+1}|\setF_k\right]\|^2\right] + \E\left[\|\mV_k - \E\left[\mV_k|\setF_{k-1}\right]\|^2\right] + \E\left[\sum_{i=1}^{n}\|\nabla F_i(x_i^k) - \nabla F_i(\bar x^k)\|^2\right] \\
        + & \E\left[\sum_{i=1}^{n}\|\nabla F_i(\bar x^k) - \nabla F_i(\bar x^{k-1})\|^2\right] + \E\left[\sum_{i=1}^{n}\|\nabla F_i(x_i^{k-1}) - \nabla F_i(\bar x^{k-1})\|^2\right]\bigg\} \\
        \leq &5\left(2n\sigma^2 + L_{\nabla F}^2\E\left[\|\mX_k - \bar \mX_k\|^2 + \|\mX_{k-1} - \bar \mX_{k-1}\|^2 + n\alpha_{k-1}^2\|\bar x^{k-1} - \bar y^{k-1}\|^2\right]\right),
        \end{aligned}
    \end{equation}
    % \|\mX_k - \bar \mX_k\|^2 + \|\mX^{k-1} - \bar \mX^{k-1}\|^2 + n\alpha_{k-1}^2\|\bar x^{k-1} - \bar y^{k-1}\|^2
    % \begin{align*}
    %     &\|V_{k+1} - V_k\|^2 \\
    %     = &\|\E\left[V_{k+1}|\setF_k\right] - V_k\|^2 + \|V_{k+1} - \E\left[V_{k+1}|\setF_k\right]\|^2 + 2\<\E\left[V_{k+1}|\setF_k\right] - V_k, V_{k+1} - \E\left[V_{k+1}|\setF_k\right]>,
    % \end{align*}
    % which gives
    % \begin{align*}
    %     &\E\left[\|V_{k+1} - V_k\|^2|\setF_k\right] \\
    %     = &\|\E\left[V_{k+1}|\setF_k\right] - V_k\|^2 + \E\left[\|V_{k+1} - \E\left[V_{k+1}|\setF_k\right]\|^2|\setF_k\right] \\
    %     \leq &2\|\E\left[V_{k+1}|\setF_k\right] - \E\left[V_k|\setF_{k-1}\right]\|^2 + 2\|V_k - \E\left[V_k|\setF_{k-1}\right]\|^2 + \E\left[\|V_{k+1} - \E\left[V_{k+1}|\setF_k\right]\|^2|\setF_k\right].
    % \end{align*}
    % Note that
    % \begin{align*}
    %     \|\E\left[V_{k+1}|\setF_k\right] - \E\left[V_k|\setF_{k-1}\right]\|^2 \leq L_{\nabla F}^2\|\mX_k - \mX^{k-1}\|^2\leq 3L_{\nabla F}^2\left(\|\mX_k - \bar \mX_k\|^2 + \|\mX^{k-1} - \bar \mX^{k-1}\|^2 + n\alpha_{k-1}^2\|\bar x^{k-1} - \bar y^{k-1}\|^2\right)
    % \end{align*}
    where the first inequality uses the Cauchy-Schwarz inequality, and the second inequality uses Lipschitz continuity of $\nabla F_i$ and \eqref{eq: xd2_gt}. For simplicity we set $x_i^{-1} = y_i^{-1} = 0$ for all $i$ so that it is easy to check the above inequality holds for all $k\geq 0$. Using \eqref{ineq: U_V_cons} and \eqref{ineq: delta_V} we know:
    \begin{align}
        &\sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mU_{k} - \bar \mU_{k}\|^2\right] \notag\\
        \leq &\frac{10\varrho(m)}{n} \sum_{k=0}^{K}\alpha_k^p\left(2n\sigma^2 + L_{\nabla F}^2\E\left[\|\mX_k - \bar \mX_k\|^2 + \|\mX_{k-1} - \bar \mX_{k-1}\|^2 + n\alpha_{k-1}^2\|\bar x^{k-1} - \bar y^{k-1}\|^2\right]\right) \notag \\
            \leq &\frac{20L_{\nabla F}^2\varrho(m)}{n}\sum_{k=0}^{K}\alpha_k^p\E\left[\|\mX_k - \bar \mX_k\|^2\right] + 10L_{\nabla F}^2\varrho(m)\sum_{k=0}^{K}\alpha_k^{p+2}\E\left[\|\bar x^k - \bar y^k\|^2\right] +20\sigma^2 \varrho(m)\sum_{k=0}^{K}\alpha_k^{p} \label{ineq: U_cons_gt},
            % \leq & \sum_{k=0}^{K}\frac{\alpha_k^{p-2}}{5\gamma^2n}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + \frac{10L_{\nabla F}^2(1+\rho^{2m})\rho^{2m}}{(1-\rho^{2m})^2}\sum_{k=0}^{K}\alpha_k^{p+2}\E\left[\|\bar x^k - \bar y^k\|^2\right] + \frac{40\rho^{2m}\sigma^2}{(1-\rho^{2m})^2}\sum_{k=0}^{K}\alpha_k^{p},
    \end{align}
    where the second inequality uses the monotonicity of $\{\alpha_k\}$ from \eqref{ineq: alpha_m_condition_gt}.
    % \begin{align}
    %     \sum_{k=0}^{K}\alpha_k^p\|\mU_{k} - \bar \mU_{k}\|^2&\leq \frac{20}{n^2}\sum_{k=0}^{K}\alpha_k^p\left(2n\sigma^2 + L_{\nabla F}^2\E\left[\|\mX_k - \bar \mX_k\|^2 + \|\mX^{k-1} - \bar \mX^{k-1}\|^2 + n\alpha_{k-1}^2\|\bar x^{k-1} - \bar y^{k-1}\|^2\right]\right). \notag \\
    %         &\leq \frac{40L_{\nabla F}^2}{n^2}\sum_{k=0}^{K}\alpha_k^p\E\left[\|\mX_k - \bar \mX_k\|^2\right] + \frac{20L_{\nabla F}^2}{n}\sum_{k=0}^{K}\alpha_k^{p+2}\E\left[\|\bar x^k - \bar y^k\|^2\right] + \frac{40\sigma^2}{n}\sum_{k=0}^{K}\alpha_k^{p} \label{ineq: U_cons_gt}.
    % \end{align}
    For other consensus error terms we follow the same proof in Lemma \ref{lem: consensus} to get
    \begin{align}
        &\norm{\mX_{k+1} - \bar \mX_{k+1}}^2\leq \frac{(1+\rho^{2m})}{2}\norm{\mX_k - \bar \mX_k}^2 + \frac{(1+\rho^{2m})\rho^{2m}}{1-\rho^{2m}}\alpha_k^2\norm{\mY_k - \bar \mY_k}^2, \label{ineq: X_cons_gt}\\
        &\norm{\mY_k - \bar \mY_k}^2 \leq 2(\norm{\mX_k - \bar \mX_k}^2 + \gamma^2\norm{\mZ_k - \bar \mZ_k}^2), \label{ineq: Y_cons_gt}\\
        &\norm{\mZ_{k+1} - \bar \mZ_{k+1}}^2 \leq \frac{(1+\rho^{2m})}{2}\norm{\mZ_k - \bar \mZ_k}^2 + \frac{(1+\rho^{2m})\rho^{2m}}{1-\rho^{2m}}\alpha_k^2\norm{\mU_k - \bar \mU_k}^2 \label{ineq: Z_cons_gt}.
    \end{align}
    Hence we know \eqref{ineq: x_z_cons} still holds:
    \begin{equation}\label{ineq: x_z_cons_gt}
        \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right]\leq \sum_{k=0}^{K}\frac{\gamma^2\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right].
    \end{equation}
    Applying Lemma \ref{lem: cons_decay} in \eqref{ineq: Z_cons_gt} with $\tau_k = \frac{\alpha_k^p}{n}$, we have
    \begin{equation}\label{ineq: z_u_cons_gt}
        \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right]\leq 2\varrho(m)\sum_{k=0}^{K}\frac{\alpha_k^{p+2}}{n}\E\left[\|\mU_k - \bar \mU_k\|^2\right].
    \end{equation}
    The above two inequalities together with \eqref{ineq: U_cons_gt} and \eqref{ineq: alpha_m_condition_gt} imply
    \begin{align*}
        &\sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right]\leq  2\varrho(m)\gamma^2\sum_{k=0}^{K}\frac{\alpha_k^{p+2}}{n}\E\left[\|\mU_k - \bar \mU_k\|^2\right] \\
        \leq& \sum_{k=0}^{K}\left\{40L_{\nabla F}^2\gamma^2\varrho(m)^2\alpha_k^2\right\}\frac{\alpha_k^{p}}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + 20 \gamma^2 \varrho(m)^2 \sum_{k=0}^{K}\alpha_k^{p+2}\left\{L_{\nabla F}^2 \alpha_k^2\E\left[\|\bar x^k - \bar y^k\|^2\right] +2\sigma^2\right\}\\
        \leq&\frac{1}{2}\sum_{k=0}^{K}\frac{\alpha_k^{p}}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + 20 \gamma^2 \varrho(m)^2 \sum_{k=0}^{K}\alpha_k^{p+2}\left\{L_{\nabla F}^2 \alpha_k^2\E\left[\|\bar x^k - \bar y^k\|^2\right] +2\sigma^2\right\},
    \end{align*}
    which gives
    \begin{equation}\label{ineq: X_cons_final_gt}
        \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right]\leq 40 \gamma^2 \varrho(m)^2 \sum_{k=0}^{K}\alpha_k^{p+2}\left\{L_{\nabla F}^2 \alpha_k^2\E\left[\|\bar x^k - \bar y^k\|^2\right] +2\sigma^2\right\}.
    \end{equation}
    Combining \eqref{ineq: alpha_m_condition_gt}, \eqref{ineq: U_cons_gt}, \eqref{ineq: z_u_cons_gt}, and \eqref{ineq: X_cons_final_gt}, we obtain that
    \begin{align*}
        &\sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right] \leq 2\varrho(m)\sum_{k=0}^{K}\frac{\alpha_k^{p+2}}{n}\E\left[\|\mU_k - \bar \mU_k\|^2\right] \notag\\
        \leq&\frac{1}{2\gamma^2}\sum_{k=0}^{K}\frac{\alpha_k^{p}}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right] + 20 \varrho(m)^2 \sum_{k=0}^{K}\alpha_k^{p+2}\left\{L_{\nabla F}^2 \alpha_k^2\E\left[\|\bar x^k - \bar y^k\|^2\right] +2\sigma^2\right\}\\
        \leq &40 \varrho(m)^2 \sum_{k=0}^{K}\alpha_k^{p+2}\left\{L_{\nabla F}^2 \alpha_k^2\E\left[\|\bar x^k - \bar y^k\|^2\right] +2\sigma^2\right\}.
    \end{align*}
    % Combining \eqref{ineq: Y_cons_gt}, \eqref{ineq: X_cons_final_gt}, and \eqref{ineq: Z_cons_final_gt}, we have
    % \begin{equation}\label{ineq: Y_cons_final_gt}
    %     \sum_{k=0}^{K}\frac{\alpha_k^p}{n}\E\left[\|\mY_k - \bar \mY^k\|^2\right]\leq \frac{24\gamma^2}{n}\left\{L_{\nabla F}^2\sum_{k=0}^{K}\alpha_k^{p+4}\E\left[\|\bar x^k - \bar y^k\|^2\right] + 2\sigma^2\sum_{k=0}^{K}\alpha_k^{p+2}\right\}.
    % \end{equation}
    % Besides, we know
    % \begin{equation}\label{ineq: y_bar_y_plus_final_gt}
    %     \begin{aligned}
    %         &\sum_{k=0}^{K} \alpha_k^p\E\left[\|\bar y^k - y_+^k\|^2\right] = \sum_{k=0}^{K}\alpha_k^p \E\left[\|\avein\left(y_i^k - y_+^k\right)\|^2\right]\leq \sum_{k=0}^{K} \frac{\alpha_k^p}{n}\sum_{i=1}^{n}\E\left[\|y_i^k - y_+^k\|^2\right] \\
    %     \leq &\sum_{k=0}^{K}\frac{2\alpha_k^p}{n} \E\left[\|\mX_{k} - \bar \mX_k\|^2 + \gamma^2\|\mZ_{k} - \bar \mZ_k\|^2\right]\leq \frac{24\gamma^2}{n}\left\{L_{\nabla F}^2\sum_{k=0}^{K}\alpha_k^{p+4}\E\left[\|\bar x^k - \bar y^k\|^2\right] + 2\sigma^2\sum_{k=0}^{K}\alpha_k^{p+2}\right\}.
    %     \end{aligned}
    % \end{equation}
\end{proof}
% {\bf Remark:} Note that a sufficient condition to ensure \eqref{ineq: alpha_m_condition_gt} is
% \[
%     \alpha_k \leq \min\left\{\frac{1}{4\rho^m} - \frac{\rho^m}{4}, \frac{(1-\rho^{2m})^2}{18L_{\nabla F}\gamma \rho^{2m}}\right\}
% \]

\begin{lemma}[Basic inequalities of dual convergence]\label{lem: z_nabla_F}
Define
\begin{equation}\label{def: delta_r}
    \begin{split}
        \delta^{k} &= \frac{\nabla F(\bar x^k) - \nabla F(\bar x^{k+1})}{\alpha_k} +  
        \frac{1}{n}\sum_{i=1}^{n} \nabla F_i(x^k_i) - \nabla F(\bar x^k),\quad \bar\Delta^{k+1} = \bar v^{k+1} - \frac{1}{n}\sum_{i=1}^{n} \nabla F_i(x^k_i).
    \end{split}
\end{equation}
    Under Assumption \ref{aspt:lipschitz-gradient}, we have
    \begin{equation}\label{ineq:zbar-mse-recursive}
    \begin{split}
        \|\bar z^{k+1} - & \nabla F(\bar x^{k+1})\|^2 \leq (1-\alpha_k) \norm{\bar z^k - \nabla F(\bar x^k)}^2 +  2L_{\nabla F}^2 \alpha_k\norm{\bar x^k -  \bar y^k}^2  + \alpha_k^2 \norm{\bar\Delta^{k+1}}^2\\
        &+ \frac{2L_{\nabla F}^2\alpha_k}{n}\norm{\mX_k - \bar \mX_k}^2  + 2\< \alpha_k \bar\Delta^{k+1}, (1-\alpha_k) \left(\bar z^k - \nabla F(\bar x^k)\right) + \alpha_k\delta^{k}>,
    \end{split}
    \end{equation}
    and
    \begin{equation}\label{ineq:zbar-diff}
    \begin{split}
        \norm{\bar z^{k+1} - \bar z^{k}}^2 & \leq  \alpha_k^2 \bigg\{2\norm{\nabla F(\bar x^k) - \bar z^k}^2 +  \frac{2 L_{\nabla F}^2}{n}\norm{\mX_k - \bar \mX_k}^2 + \norm{\bar \Delta^{k+1}}^2 \\
        &\hspace{16em} + 2\<\bar \Delta^{k+1}, \frac{1}{n}\sum_{i=1}^{n} \nabla F_i(x^k_i) - \bar z^k>\bigg\}.
    \end{split}
    \end{equation}
\end{lemma}
\begin{proof}
    By definitions in \eqref{def: delta_r}, we have
    \begin{equation*}
        \bar z^{k+1} - \nabla F(\bar x^{k+1}) = (1-\alpha_k) \left(\bar z^k - \nabla F(\bar x^k)\right) + \alpha_k\delta^{k}  + \alpha_k \bar\Delta^{k+1}.
    \end{equation*}
    Hence, we can get
    \begin{align*}
        &\norm{\bar z^{k+1} - \nabla F(\bar x^{k+1})}^2 \\
        &= \norm{(1-\alpha_k) \left(\bar z^k - \nabla F(\bar x^k)\right) + \alpha_k\delta^{k}}^2  + \alpha_k^2 \norm{\bar\Delta^{k+1}}^2 + 2\< \alpha_k \bar\Delta^{k+1}, (1-\alpha_k) \left(\bar z^k - \nabla F(\bar x^k)\right) + \alpha_k\delta^{k}>\\
        &\leq (1-\alpha_k) \norm{\bar z^k - \nabla F(\bar x^k)}^2 + \alpha_k \norm{\delta^{k}}^2 + \alpha_k^2 \norm{\bar\Delta^{k+1}}^2 + 2\< \alpha_k \bar\Delta^{k+1}, (1-\alpha_k) \left(\bar z^k - \nabla F(\bar x^k)\right) + \alpha_k\delta^{k}>
    \end{align*}
    where the inequality uses the convexity of $\|\cdot\|^2$. In addition, we have
    \begin{align*}
        \norm{\delta^{k}}^2 &\leq 2\norm{\frac{\nabla F(\bar x^k) - \nabla F(\bar x^{k+1})}{\alpha_k}}^2 + 2\norm{\avein\left(\nabla F_i(x_i^k) - \nabla F_i(\bar x^k)\right)}^2 \\
        &\leq 2L_{\nabla F}^2 \norm{\bar x^k - \bar y^k}^2 + \frac{2L_{\nabla F}^2}{n}\norm{\mX_k - \bar \mX_k}^2,
    \end{align*}
    which completes the proof of \eqref{ineq:zbar-mse-recursive}. The inequality \eqref{ineq:zbar-diff} can be proved similarly by noting that
    \begin{align}
        & \norm{\bar z^{k+1} - \bar z^k}^2  = \alpha_k^2 \norm{-\bar z^k + \bar v^{k+1}}^2\notag \\
        &= \alpha_k^2 \norm{(\nabla F(\bar x^k) - \bar z^k) + \left(\frac{1}{n}\sum_{i=1}^{n}\left(\nabla F_i(x^k_i) - \nabla F_i(\bar x^k)\right)\right) + \bar \Delta^{k+1}}^2 \notag\\
        &= \alpha_k^2 \bigg\{\norm{(\nabla F(\bar x^k) - \bar z^k) + \left(\frac{1}{n}\sum_{i=1}^{n}\left(\nabla F_i(x^k_i) - \nabla F_i(\bar x^k)\right)\right)}^2  + \norm{\bar \Delta^{k+1}}^2 + 2\<\bar \Delta^{k+1}, \frac{1}{n}\sum_{i=1}^{n} \nabla F_i(x^k_i) - \bar z^k>\bigg\}.\notag
    \end{align}
\end{proof}


\begin{lemma}[]\label{lem: psi_yplus_ybar} Under Assumption \ref{aspt:Psi},
\begin{equation}\label{ineq: delta_psi_2}  
    \Psi(\bar y^k)  - \Psi(y_+^k)\leq \< \bar z^k + \gamma^{-1}(\bar y^k - \bar x^k), y^k_+ - \bar y^k  > + \frac{\gamma}{2n}\norm{\mZ_k - \bar \mZ_k}^2 + \frac{\gamma^{-1}}{2n}\norm{\mX_k - \bar \mX_k }^2. 
\end{equation}
\end{lemma}
\begin{proof}
By the convexity of $\Psi$ and part (b) of Lemma \ref{lem:eta-smooth}, we have
\begin{align}
    \Psi(\bar y^k) & - \Psi(y_+^k) \overset{\text{cvx}}{\leq} \frac{1}{n}\sum_{i=1}^{n} \left(\Psi(y_i^k) - \Psi(y^k_+)\right) \overset{\text{Lemma \ref{lem:eta-smooth} (b)}}{\leq} \frac{1}{n}\sum_{i=1}^{n}\< z_i^k + \gamma^{-1} (y_i^k - x_i^k) , y^k_+ - y_i^k >\notag\\
    &= \< \bar z^k + \gamma^{-1}(\bar y^k - \bar x^k), y^k_+ - \bar y^k  > + \frac{1}{n}\sum_{i=1}^{n} \< z_i^k - \bar z^k + \gamma^{-1} (y_i^k -\bar y^k + \bar x^k - x_i^k)  ,  \bar y^k - y_i^k>\notag\\
    &\leq \< \bar z^k + \gamma^{-1}(\bar y^k - \bar x^k), y^k_+ - \bar y^k  > + \frac{\gamma}{2n}\norm{\mZ_k - \bar \mZ_k}^2 + \frac{1}{2n\gamma}\norm{\mX_k - \bar \mX_k}^2. \notag
\end{align}
The equality above comes from the fact that for sequences $\{a_i\}_{1\leq i\leq n}, \{b_i\}_{1\leq i\leq n} \in \realset^d$, we have
\[
    \sum_{i=1}^{n} \<a_i - \frac{1}{n}\sum_{j=1}^{n}a_j, b_i - \frac{1}{n} \sum_{j=1}^{n}b_j> = \sum_{i=1}^{n} \<a_i, b_i> - n\<\frac{1}{n}\sum_{j=1}^{n}a_j, \frac{1}{n}\sum_{j=1}^{n}b_j>.
\]
The last inequality above is obtained by Young's inequalities:
\begin{align}
    \< z^k_i - \bar z^k, \bar y^k - y^k_i> &\leq  \frac{\gamma}{2} \norm{z^k_i - \bar z^k}^2 + \frac{1}{2\gamma} \norm{y^k_i - \bar y^k}^2, \notag\\
    \gamma^{-1}\< \bar x^k - x^k_i, \bar y^k - y^k_i> &\leq \frac{1}{2\gamma}\norm{x^k_i - \bar x^k}^2  + \frac{1}{2\gamma} \norm{y^k_i - \bar y^k}^2.\notag
\end{align}
\end{proof}



\begin{lemma}[Basic lemma of merit function difference]\label{lem: main-merit-func-diff}
Let $W(\bar x^k, \bar z^k)$ be the merit function defined in \eqref{def: merit_fun} with $\lambda = \frac{\gamma^{-1}}{8 L_{\nabla F}^2}$. Under Assumptions \ref{aspt:lipschitz-gradient} and \ref{aspt:Psi}, for any $k \geq 0$, setting  $\alpha_k \leq \min\{\frac{\gamma^{-1}}{8 L_{\nabla F}}, \frac{\gamma^{-1}}{8 C_\gamma}, \frac{\gamma^{-1}}{32C_\gamma L_{\nabla F}^2}\}$,  we have
\begin{equation*}
     W(\bar x^{k+1}, \bar z^{k+1}) - W(\bar x^{k}, \bar z^{k}) \leq - \alpha_k \left\{\Theta^k  +  \Upsilon^k + \alpha_k \Lambda^k +  r^{k+1} \right\},
\end{equation*}
where
\begin{equation}\label{def:Theta-Lambda-Upsilon-r}
\begin{split}
    \Theta^k = & \left\{\frac{\gamma^{-1}}{4} \|\bar x^k  - \bar y^k\|^2 + \frac{\lambda}{4} \norm{\nabla F(\bar x^k) - \bar z^k}^2 \right\},  \quad \Lambda^k = \left\{\frac{C_\gamma + 2\lambda}{2}\norm{\bar \Delta^{k+1}}^2\right\},\\
    \Upsilon^k = & \left\{\frac{2\gamma(1 + 4\gamma^2L_{\nabla F}^2)}{n}\norm{\mZ_k - \bar \mZ_k}^2 + \frac{2\left(\gamma^{-1} + 3\gamma L_{\nabla F}^2 \right)}{n}\norm{\mX_k - \bar \mX_k}^2\right\},\\
    r^{k+1}  = & \< \bar \Delta^{k+1},  \bar x^k - y^k_+ + C_\gamma \alpha_k\left(\frac{1}{n}\sum_{i=1}^{n} \nabla F_i(x^k_i) - \bar z^k\right) + 2\lambda \left( (1-\alpha_k) \left(\bar z^k - \nabla F(\bar x^k)\right) + \alpha_k\delta^{k}\right)>.
\end{split}
\end{equation}
\end{lemma}
\begin{proof}
    By the smoothness of $F$ and $\eta$, we have
\begin{align}
    &F(\bar x^{k+1}) - F(\bar x^k) \notag\\
    \leq &\<\nabla F(\bar x^k), \bar x^{k+1} - \bar x^k> + \frac{L_{\nabla F}}{2}\|\bar x^{k+1} - \bar x^k\|^2 = -\alpha_k\<\nabla F(\bar x^k), \bar x^k - \bar y^k> + \frac{L_{\nabla F}\alpha_k^2}{2}\|\bar x^k - \bar y^k\|^2\label{ineq: delta_f}\\
    &\eta(\bar x^k, \bar z^k) - \eta(\bar x^{k+1}, \bar z^{k+1})\notag\\
    \leq & \<-\bar z^k - \gamma^{-1} (y^k_+ - \bar x^k), \bar x^k - \bar x^{k+1}> + \<y^k_+ - \bar x^k, \bar z^k - \bar z^{k+1}> + \frac{C_\gamma}{2}\left(\|\bar x^{k+1}- \bar x^k\|^2 + \|\bar z^{k+1} - \bar z^k\|^2\right) \notag\\
    = &2\alpha_k\<\bar z^k, y^k_+-\bar x^k> + \gamma^{-1}\alpha_k\|\bar x^k - y^k_+\|^2 + \alpha_k\<\bar v^{k+1}, \bar x^k - \bar y^k>  \notag \\
    & \quad + \alpha_k \<\bar z^k + \gamma^{-1}(y^k_+-\bar x^k) + \bar v^{k+1} , \bar y^k - y^k_+>  + \frac{C_\gamma}{2}\left(\alpha_k^2\|\bar x^k-\bar y^k\|^2 + \|\bar z^{k+1} - \bar z^k\|^2\right).\label{ineq: delta_eta_1}
\end{align}
Since $y^{k}_{+}$ is the minimizer of a $1/\gamma$-strongly convex function, we have
\begin{equation*}
    \<\bar z^k, y^k_+ - \bar x^k> + \frac{1}{2\gamma}\|y^k_+ - \bar x^k\|^2 +\Psi(y^k_+) \leq \Psi(\bar x^k)  - \frac{1}{2\gamma}\|y^k_+ - \bar x^k\|^2,
\end{equation*}
which together with \eqref{ineq: delta_eta_1} gives
\begin{align}
    &\eta(\bar x^k, \bar z^k) - \eta(\bar x^{k+1},\bar z^{k+1}) \notag \\
    \leq &-\gamma^{-1}\alpha_k\|\bar x^k - y^k_+\|^2 + \alpha_k\<\bar v^{k+1}, \bar x^k - \bar y^k> + \alpha_k\< \bar z^k + \gamma^{-1}(y^k_+-\bar x^k) + \bar v^{k+1}, \bar y^k - y^k_+> \notag\\
    & + 2\alpha_k \left(\Psi(\bar x^k) - \Psi(y^k_+)\right) +\frac{C_\gamma}{2}\left(\|\bar x^{k+1}-\bar x^k\|^2 + \|\bar z^{k+1} - \bar z^k\|^2\right). \label{ineq: delta_eta_2}
\end{align}
By the convexity of $\Psi$, we have
\begin{equation}\label{ineq: delta_psi_1}
    \Psi(\bar x^{k+1}) - \Psi(\bar x^k) \leq (1-\alpha_k) \Psi(\bar x^k) + \alpha_k \Psi(\bar y^k) - \Psi(\bar x^k) = \alpha_k \left( \Psi(\bar y^k) - \Psi(\bar x^k) \right).
\end{equation}
Combining \eqref{ineq: delta_f}, \eqref{ineq: delta_eta_2}, and \eqref{ineq: delta_psi_1}, we have
\begin{equation}\label{ineq: merit_diff_1}
    \begin{aligned}
        &\left[\Phi(\bar x^{k+1}) + \Psi(\bar x^{k+1}) - \eta(\bar x^{k+1}, \bar z^{k+1})\right] - \left[\Phi(\bar x^{k}) + \Psi(\bar x^k) - \eta(\bar x^{k}, \bar z^{k})\right]  \\
    \leq &- \gamma^{-1}\alpha_k\|\bar x^k - y^k_{+}\|^2  + \alpha_k \< \bar v^{k+1}-\nabla F(\bar x^k), \bar x^k - \bar y^k> + 2\alpha_k(\Psi(\bar y^k) - \Psi(y_+^k))\\
    & + \alpha_k\< \bar z^k + \gamma^{-1}(y^k_+ -\bar x^k) + 
    \bar v^{k+1}, \bar y^k - y^k_+> +\frac{(L_{\nabla F}+C_\gamma)\alpha_k^2}{2}\|\bar x^k - \bar y^k\|^2 + \frac{C_{\gamma}}{2} \|\bar z^{k+1} - \bar z^k\|^2.
    \end{aligned}
\end{equation}
Removing non-smooth terms in \eqref{ineq: merit_diff_1} using \eqref{ineq: delta_psi_2} in Lemma \ref{lem: psi_yplus_ybar}, and re-organizing \eqref{ineq: merit_diff_1} using the decomposition that $\bar z^{k+1} - \bar z^k = \alpha_k(-\bar z^k + \bar v^{k+1}) = \alpha_k (\nabla F(\bar x^k) - \bar z^k) + \alpha_k (\frac{1}{n}\sum_{i=1}^{n}(\nabla F_i(x^k_i) - \nabla F_i(\bar x^k))) + \alpha_k \bar \Delta^{k+1}$, we can get
\begin{align}
     &\left[\Phi(\bar x^{k+1}) + \Psi(\bar x^{k+1}) - \eta(\bar x^{k+1}, \bar z^{k+1})\right] - \left[\Phi(\bar x^{k}) + \Psi(\bar x^k) - \eta(\bar x^{k}, \bar z^{k})\right]  \notag\\
    \leq & \underbrace{\gamma^{-1}\alpha_k\bigg\{-\|\bar x^k  - y^k_{+}\|^2  + \< (y^k_+ -\bar y^k) + (\bar x^k - \bar y^k) , \bar y^k - y^k_+> \bigg\}}_{\varkappa_1}  \notag \\
    & + \underbrace{\alpha_k \<  \frac{1}{n}\sum_{i=1}^{n} \left( \nabla F_i (x^k_i) - \nabla F_i(\bar x^k) \right), \bar x^k - y^k_+ >}_{\varkappa_2} + \underbrace{\alpha_k\< \nabla F(\bar x^k) - \bar z^k,   \bar y^k - y^k_+>}_{\varkappa_3} +  \alpha_k \< \bar \Delta^{k+1},  \bar x^k - y^k_+>  \notag\\
    & + \frac{(L_{\nabla F}+C_\gamma)\alpha_k^2}{2}\|\bar x^k - \bar y^k\|^2 + \underbrace{\frac{C_{\gamma}}{2} \|\bar z^{k+1} - \bar z^k\|^2}_{\varkappa_4} + \frac{\gamma\alpha_k}{n}\norm{\mZ_k - \bar \mZ_k}^2 + \frac{\gamma^{-1}\alpha_k}{n}\norm{\mX_k - \bar \mX_k}^2.\notag
\end{align}
To further simplify the above inequality, we bound the terms $\varkappa_1, \varkappa_2, \varkappa_3, \varkappa_4$ separately as follows:
\begin{align}
\varkappa_1 = & \gamma^{-1}\alpha_k \left\{- \norm{\bar x^k - \bar y^k}^2 - \< \bar x^k - \bar y^k, \bar y^k - y^k_+ > - 2 \norm{\bar y^k - y^k_{+}}^2\right\} \leq -\frac{7\gamma^{-1}\alpha_k}{8} \norm{\bar x^k - \bar y^k}^2,\notag\\
\varkappa_2 \leq &\, 2\gamma\alpha_k\norm{\frac{1}{n}\sum_{i=1}^{n} \left( \nabla F_i (x^k_i) - \nabla F_i(\bar x^k) \right)}^2 + \frac{\gamma^{-1}\alpha_k}{8} \norm{ \bar x^k - y^k_+}^2\notag\\
\leq &\, \frac{2\gamma \alpha_k L_{\nabla F}^2}{n} \norm{\mX_k - \bar \mX_k}^2+ \frac{\gamma^{-1}\alpha_k}{4} \norm{ \bar x^k - \bar y^k}^2 + \frac{\gamma^{-1}\alpha_k}{4} \norm{ \bar y^k - y^k_+}^2\notag,\\
\varkappa_3 \leq &\, \frac{\lambda \alpha_k}{2} \norm{\nabla F(\bar x^k) - \bar z^k}^2 + \frac{\lambda^{-1}\alpha_k}{2} \norm{\bar y^k - y^k_+}^2,   \notag\\ 
\varkappa_4 \leq &\, \frac{C_\gamma \alpha_k^2}{2} \left\{2\norm{\nabla F(\bar x^k) - \bar z^k}^2 +  \frac{2 L_{\nabla F}^2}{n}\norm{\mX_k - \bar \mX_k}^2 + \norm{\bar \Delta^{k+1}}^2 + 2\<\bar \Delta^{k+1}, \frac{1}{n}\sum_{i=1}^{n} \nabla F_i(x^k_i) - \bar z^k> \right\}. \notag
\end{align}
Combining the above results with \eqref{ineq:zbar-mse-recursive} in Lemma \ref{lem: z_nabla_F} and the definition of $W(\bar x^k, \bar z^k)$ in \eqref{def: merit_fun}, we have
\begin{align}
     &W(\bar x^{k+1}, \bar z^{k+1}) - W(\bar x^{k}, \bar z^{k}) \leq  \alpha_k\left\{- \frac{5}{8}\gamma^{-1} + \frac{(L_{\nabla F} + C_\gamma)\alpha_k}{2} + 2\lambda L_{\nabla F}^2 \right\}\|\bar x^k  - \bar y^k\|^2 \notag \\
     &+ \alpha_k\left\{ - \frac{\lambda}{2} + C_\gamma \alpha_k \right\}\norm{\nabla F(\bar x^k) - \bar z^k}^2 + \frac{C_\gamma \alpha_k^2}{2}\norm{\bar \Delta^{k+1}}^2 +  \frac{(\gamma^{-1}  + 2\lambda^{-1})\alpha_k}{4} \norm{y^k_+ - \bar y^k}^2 \notag \\
    & + \frac{\gamma\alpha_k}{n}\norm{\mZ_k - \bar \mZ_k}^2 + \frac{\left(\gamma^{-1}+ 2\gamma L_{\nabla F}^2 + 2\lambda L_{\nabla F}^2 + C_\gamma L_{\nabla F}^2\alpha_k \right)\alpha_k}{n}\norm{\mX_k - \bar \mX_k}^2 \notag\\
     & +  \alpha_k \underbrace{\< \bar \Delta^{k+1},  \bar x^k - y^k_+ + C_\gamma \alpha_k\left(\frac{1}{n}\sum_{i=1}^{n} \nabla F_i(x^k_i) - \bar z^k\right) + 2\lambda \left( (1-\alpha_k) \left(\bar z^k - \nabla F(\bar x^k)\right) + \alpha_k\delta^{k}\right)>}_{r^{k+1}}. \label{ineq: merit_diff_2}
\end{align}
In addition, from Lemma \ref{lem: consensus_y}, we already know
\begin{equation*}
    \norm{y^k_+ - \bar y^k}^2  \leq \frac{2}{n} \left\{\|\mX_{k} - \bar \mX_k\|^2 + \gamma^2\|\mZ_{k} - \bar \mZ_k\|^2 \right\}.
\end{equation*}
Finally, choosing $\alpha_k$ such that $ \alpha_k \leq \min\{\frac{\gamma^{-1}}{8 L_{\nabla F}}, \frac{\gamma^{-1}}{8 C_\gamma}, \frac{\gamma^{-1}}{32C_\gamma L_{\nabla F}^2}\}$ and $\lambda = \frac{\gamma^{-1}}{8 L_{\nabla F}^2}$, we can re-organize the terms in \eqref{ineq: merit_diff_2} as follows and complete the proof.
\begin{align}
     &W(\bar x^{k+1}, \bar z^{k+1}) - W(\bar x^{k}, \bar z^{k})  \notag\\
    \leq & - \alpha_k \underbrace{\left\{\frac{\gamma^{-1}}{4} \|\bar x^k  - \bar y^k\|^2 + \frac{\lambda}{4} \norm{\nabla F(\bar x^k) - \bar z^k}^2 \right\}}_{\Theta^k} + \alpha_k^2 \underbrace{\left\{\frac{C_\gamma + 2\lambda}{2}\norm{\bar \Delta^{k+1}}^2\right\}}_{\Lambda^k}\notag + \alpha_k r^{k+1} \\
    & + \alpha_k \underbrace{\left\{\frac{2\gamma(1 + 4\gamma^2L_{\nabla F}^2)}{n}\norm{\mZ_k - \bar \mZ_k}^2 + \frac{2\left(\gamma^{-1} + 3\gamma L_{\nabla F}^2 \right)}{n}\norm{\mX_k - \bar \mX_k}^2\right\}}_{\Upsilon^k}. \notag
\end{align}
\end{proof}


\section{Discussion on the Definition of Consensus Error}\label{sec: conserror}
In this section, we briefly discuss two different functions that measure the consensus violation of vectors among agents. Suppose agent $i$ holds $x_i\in\realset^d$. Our consensus error can then be viewed as
\[
    f(x_1, ..., x_n) = \frac{1}{n}\sum_{i=1}^{n}\|x_i - \bar x\|^2,
\]
where $\bar x \coloneqq \frac{1}{n}\sum_{i=1}^{n}x_i$, while \texttt{SPPDM} in \cite{wang2021distributed} defines (see Eq. (4a), (4b), (5a), (5b), and (41) in \cite{wang2021distributed})
\begin{equation}\label{eq: sppdm_cons}
    \begin{aligned}
        g_W(x_1, ..., x_n) &= \sum_{i\sim j, 1\leq i< j\leq n}\|x_i - x_j\|^2  \\
    &=\frac{1}{2}\sum_{i= j \text{ or } i\sim j}\left(\|x_i-\bar x\|^2 + \|x_j-\bar x\|^2 - 2\<x_i-\bar x, x_j-\bar x>\right) 
    \end{aligned}
\end{equation}
over a connected network whose weighted adjacency matrix (i.e., mixing matrix) is $W$, and the stationarity therein is defined by using $g_W$. Here, $i\sim j$ means that agents $i$ and $j$ are neighbors. Note that, in general, the relationship between $f$ and $g_W$ largely depends on $W$. We consider several special cases:
\begin{itemize}
    \item The network associated with $W$ is a complete graph. By \eqref{eq: sppdm_cons} we have
    \begin{align*}
        g_W(x_1,...,x_n) &= n\sum_{i=1}^{n}\|x_i - \bar x\|^2 - \<\sum_{i=1}^{n}\left(x_i - \bar x\right), \sum_{j=1}^{n}\left(x_j - \bar x\right)> = n^2 f(x_1,...,x_n).
    \end{align*}
    \item The network associated with $W$ is a cycle. By \eqref{eq: sppdm_cons} we have
    \[
        g_W(x_1,...,x_n) \leq \sum_{i\sim j, 1\leq i<j\leq n}2\left(\|x_i - \bar x\|^2 + \|x_j - \bar x\|^2\right) = 4nf(x_1,...,x_n).
    \]
    \item The network associated with $W$ is a simple path such that $i$ and $i+1$ are adjacent for all $1\leq i\leq n-1$, and $x_i = i\in \realset$. Note that in this case, we can directly obtain $g_W(x_1,...,x_n) = n - 1$. For $f$ we have
    \[
        f(x_1,...,x_n) = \frac{1}{n}\sum_{i=1}^{n}\left(\frac{n+1}{2} - i\right)^2 = \Theta(n^2),
    \]
    which implies $g_W = \Theta(\frac{f}{n})$.
\end{itemize}
We know from the above examples that the order (in terms of $n$) of $g_W / f$ can range from $\frac{1}{n}$ to $n^2$. Hence these two types of consensus error are not comparable if no additional assumptions are given, and thus we only include \texttt{SPPDM} in the experiments and do not compare their complexity results to ours.




\bibliography{xiao_524-supp}

\end{document}
