% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\usepackage[ruled,vlined]{algorithm2e} %linesnumbered
\usepackage{amssymb}
\usepackage{amsmath, amsthm, amsfonts, mathrsfs}
\usepackage{bbm}
\usepackage{bm}
\usepackage{color, colortbl}
\definecolor{LightCyan}{rgb}{0.88,1,1}
\usepackage{dirtytalk}
\usepackage{enumerate}
\usepackage{subfigure}
\usepackage{url}
\usepackage{pifont}
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\usepackage{makecell}
\usepackage{multirow, array}

% --- Theorem-like environments (amsthm is loaded above) ---
% Each environment below is declared with its own counter; only `lemma` is
% numbered within the current section (its counter resets per section).
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}[section]
\newtheorem{proposition}{Proposition}
\newtheorem{assumption}{Assumption}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
% `hproof`: the standard `proof` environment with its heading locally renamed
% to "Proof Sketch" (the \renewcommand is scoped to the environment's group).
\newenvironment{hproof}{%
  \renewcommand{\proofname}{Proof Sketch}\proof}{\endproof}

% Environments declared after this point use the `remark` theorem style.
\theoremstyle{remark}
% Starred form: `remark` is unnumbered.
\newtheorem*{remark}{Remark}


% --- Calligraphic letter shortcuts: \cX expands to \mathcal{X} for X = A..Z ---
% Math-mode only. In the visible body, e.g., \cO is used for big-O bounds,
% \cD for data distributions, \cG for the proximal gradient mapping, and
% \cH for a sub-sigma-algebra.
\newcommand{\cA}{\mathcal{A}}
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cE}{\mathcal{E}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\cG}{\mathcal{G}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cJ}{\mathcal{J}}
\newcommand{\cK}{\mathcal{K}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cM}{\mathcal{M}}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\cO}{\mathcal{O}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cR}{\mathcal{R}}
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\cU}{\mathcal{U}}
\newcommand{\cV}{\mathcal{V}}
\newcommand{\cW}{\mathcal{W}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cZ}{\mathcal{Z}}

% --- Bold upright matrix shortcuts: \mX expands to \mathbf{X} ---
% Only the letters listed here are defined (not the full alphabet).
% In the visible body these denote matrices that stack per-agent vectors as
% columns (\mX, \mZ, \mV, \mU), the mixing matrix (\mW), and the identity (\mI).
\newcommand{\mA}{\mathbf{A}}
\newcommand{\mB}{\mathbf{B}}
\newcommand{\mG}{\mathbf{G}}
\newcommand{\mI}{\mathbf{I}}
\newcommand{\mJ}{\mathbf{J}}
\newcommand{\mV}{\mathbf{V}}
\newcommand{\mW}{\mathbf{W}}
\newcommand{\mX}{\mathbf{X}}
\newcommand{\mY}{\mathbf{Y}}
\newcommand{\mZ}{\mathbf{Z}}
\newcommand{\mU}{\mathbf{U}}
\newcommand{\mF}{\mathbf{F}}

% Script F via \mathscr (mathrsfs is loaded above). In the body, \setF is the
% sigma-algebra of the probability space and \setF_k the natural filtration.
% NOTE(review): \def does not check for an existing definition of \setF.
\def\setF{\mathscr{F}} % Set F

% Blackboard-bold number sets: integers, naturals, reals.
\newcommand{\integerset}{\mathbb{Z}}
\newcommand{\naturalset}{\mathbb{N}}
\newcommand{\realset}{\mathbb{R}}

% --- One-argument operator wrappers with auto-sized delimiters ---
% Each macro typesets an upright operator name followed by its argument inside
% \left...\right delimiters: parentheses for \diag and \domain, square
% brackets for \range.
\newcommand{\diag}[1]{\mathrm{diag}\left(#1\right)}
\newcommand{\domain}[1]{\mathrm{dom}\left(#1\right)}
\newcommand{\range}[1]{\mathrm{rng}\left[#1\right]}

% --- Probability notation ---
% \E is the bare expectation symbol (no argument; brackets are written by the
% caller). \Et / \probt carry a subscript t. The cond* variants take
% (#1 = quantity, #2 = conditioning) and draw a stretchable bar via \middle|,
% which is valid only because it sits inside the macro's own \left...\right.
\newcommand{\E}{\mathbb{E}}
\newcommand{\Et}[1]{\mathbb{E}_t \left[#1\right]}
\newcommand{\prob}[1]{\mathbb{P} \left(#1\right)}
\newcommand{\condprob}[2]{\mathbb{P} \left(#1 \,\middle|\, #2\right)}
\newcommand{\probt}[1]{\mathbb{P}_t \left(#1\right)}
\newcommand{\var}[1]{\mathrm{var} \left[#1\right]}
\newcommand{\condvar}[2]{\mathrm{var} \left[#1 \,\middle|\, #2\right]}
\newcommand{\std}[1]{\mathrm{std} \left[#1\right]}
\newcommand{\condstd}[2]{\mathrm{std} \left[#1 \,\middle|\, #2\right]}
\newcommand{\cov}[1]{\mathrm{cov} \left[#1\right]}
\newcommand{\condcov}[2]{\mathrm{cov} \left[#1 \,\middle|\, #2\right]}

% --- Delimiters, indicator, norms, and assorted shorthands (math mode) ---
% Macros built on \left...\right auto-size their delimiters to the argument.
\newcommand{\abs}[1]{\left|#1\right|}
\newcommand{\ceils}[1]{\left\lceil#1\right\rceil}
% Upright differential "d"; the empty \mathop followed by \! yields operator
% spacing on the left without a visible gap before the "d".
\newcommand*\dif{\mathop{}\!\mathrm{d}}
\newcommand{\floors}[1]{\left\lfloor#1\right\rfloor}
% Indicator function 1{#1}. Uses \mathbbm from the bbm package, which this
% preamble loads; \mathds would require dsfont, which this preamble does not load.
\newcommand{\I}[1]{\mathbbm{1} \! \left\{#1\right\}}
% Fixed-size norms: \maxnorm appends the subscript \infty, \normw takes the
% subscript as #2. \norm is the auto-sized, unsubscripted variant.
\newcommand{\maxnorm}[1]{\|#1\|_\infty}
% Bracketed argument with a superscript minus / plus (negative / positive part).
\newcommand{\negpart}[1]{\left[#1\right]^-}
\newcommand{\norm}[1]{\left\|#1\right\|}
\newcommand{\normw}[2]{\|#1\|_{#2}}
\newcommand{\pospart}[1]{\left[#1\right]^+}
% Set braces around the argument.
\newcommand{\set}[1]{\left\{#1\right\}}
% Zero-argument aliases for the relations \preceq and \succeq.
\newcommand{\subreal}[0]{\preceq}
\newcommand{\supreal}[0]{\succeq}
% Transpose as a postfix superscript: write A\T.
\newcommand{\T}{^\top}
% Empirical averages (1/n) * sum over i (resp. j) from 1 to n; the summand
% follows the macro at the call site.
\newcommand{\avein}{\frac{1}{n}\sum_{i=1}^n}
\newcommand{\avejn}{\frac{1}{n}\sum_{j=1}^n}
% Bold 1 (used in the body as the all-ones vector) and its transpose.
\newcommand{\bfone}{\mathbf{1}}
\newcommand{\bfonet}{\mathbf{1}^{\top}}
% Colored-text helpers built on \textcolor (color package loaded above).
\newcommand{\red}[1]{\textcolor{red}{#1}}
\newcommand{\blue}[1]{\textcolor{blue}{#1}}

% --- Named math operators (amsmath's \DeclareMathOperator) ---
% Starred form: sub/superscripts are set as limits (below/above) in display
% style. Note the operator text ends with a thin space (\,).
\DeclareMathOperator*{\argmax}{arg\,max\,}
\DeclareMathOperator*{\argmin}{arg\,min\,}
% \det is already defined, so it is reset to \relax first; otherwise
% \DeclareMathOperator would report it as already defined. The replacement is
% the non-starred form, i.e. subscripts are set to the side, not as limits.
\let\det\relax
\DeclareMathOperator{\det}{det}
\DeclareMathOperator{\dom}{dom}
\DeclareMathOperator{\poly}{poly}
\DeclareMathOperator{\rank}{rank}
\DeclareMathOperator{\sgn}{sgn}
% Bold "prox" (proximal operator) and \Pi used as a projection operator.
\DeclareMathOperator{\prox}{\mathbf{prox}}
\DeclareMathOperator{\proj}{\Pi}
% Same reset-then-declare pattern as \det.
% NOTE(review): presumably guards against a \trace defined by another
% package or the class -- confirm which one.
\let\trace\relax
\DeclareMathOperator{\trace}{tr}
% Inner product with delimited arguments: \<a,b> typesets <a,b> in angle
% brackets. #1 ends at the FIRST comma and #2 at the first `>`, so brace any
% argument that itself contains a comma or `>`.
\def\<#1,#2>{\left\langle #1,#2\right\rangle}
% Math character with code "2D (class 0, family 0, slot hex 2D).
% NOTE(review): presumably a hyphen for use inside math-mode names -- confirm.
\mathchardef\mhyphen="2D
% amsmath: permit page breaks inside multi-line display environments.
\allowdisplaybreaks


\title{A One-Sample Decentralized Proximal Algorithm \\for Non-Convex Stochastic Composite Optimization}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<texiao@ucdavis.edu>?Subject=Your UAI 2023 paper}
{Tesi Xiao}{}}
\author[2]{\href{mailto:<xuxchen@ucdavis.edu>?Subject=Your UAI 2023 paper}
{Xuxing Chen}{}}
\author[1]{Krishnakumar Balasubramanian}
\author[3]{Saeed Ghadimi}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    University of California, Davis
    % Davis, California, USA
}
\affil[2]{%
    Department of Mathematics\\
    University of California, Davis
    % Davis, California, USA
}
\affil[3]{%
    Department of Management Sciences\\
    University of Waterloo
}
  
\begin{document}
\maketitle

\begin{abstract}
We focus on decentralized stochastic non-convex optimization, where $n$ agents work together to optimize a composite objective function which is a sum of a smooth term and a non-smooth convex term. To solve this problem, we propose two single-time scale algorithms: \texttt{Prox-DASA} and \texttt{Prox-DASA-GT}. These algorithms can find $\epsilon$-stationary points in $\cO(n^{-1}\epsilon^{-2})$ iterations using constant batch sizes (i.e., $\mathcal{O}(1)$). Unlike prior work, our algorithms achieve a comparable complexity result without requiring large batch sizes, more complex per-iteration operations (such as double loops), or stronger assumptions. Our theoretical findings are supported by extensive numerical experiments, which demonstrate the superiority of our algorithms over previous approaches. Our code is available at \url{https://github.com/xuxingc/ProxDASA}.
\end{abstract}

\section{Introduction}
\label{sec:setting}
%Decentralized optimization %has attracted increasing attention thanks to its . It 
%has been proven theoretically and empirically to be superior over centralized optimization %that training over a decentralized network can achieve linear speed-up, 
%as it can help mitigate the low network bandwidth, and preserve data privacy compared %with centralized methods 
%\citep{yuan2016convergence, lian2017can}. 

Decentralized optimization is a flexible paradigm for solving complex optimization problems in a distributed manner and has numerous applications in fields such as machine learning, robotics, and control systems. It has attracted increased attention due to the following benefits: (i) \textit{Robustness}: Decentralized optimization is more robust than centralized optimization because each agent can operate independently, making the system more resilient to failures compared to a centralized system where a coordinator failure or overload can halt the entire system. (ii) \emph{Privacy}: Decentralized optimization can provide greater privacy because each agent only has access to a limited subset of observations, which may help to protect sensitive information. (iii) \emph{Scalability}: Decentralized optimization is highly scalable as it can handle large datasets in a distributed manner, thereby solving complex optimization problems that are difficult or even impossible to solve in a centralized setting.

Specifically, we consider the following decentralized composite optimization problems in which $n$ agents collaborate to solve
\begin{equation}\label{eq:problem}
    \underset{x\in\realset^d}{\min}~ \Phi(x) \coloneqq  F(x) + \Psi(x), \ F(x) \coloneqq \frac{1}{n} \sum_{i=1}^{n} F_i(x),
\end{equation}
where each function $F_i(x)$ is a smooth function only known to the agent $i$; $\Psi(x)$ is non-smooth, convex, and shared across all agents; $\Phi(x)$ is bounded below by $\Phi_* > -\infty$. We consider the stochastic setting where the exact function values and derivatives of $F_{i}$'s are unavailable. In particular, we assume that $F_i(x)= \E_{\xi_i\sim \cD_i}[G_i(x, \xi_i)]$, where $\xi_i$ is a random vector and $\cD_i$ is the distribution used to generate samples for agent $i$. The agents form a connected and undirected network and can communicate with their neighbors to cooperatively solve \eqref{eq:problem}. The communication network can be represented with $\mathbb{G} = (\mathcal{V}, \mW)$ where $\mathcal{V} = \{v_1, v_2, \dots, v_n\}$ denotes all devices and $\mW = [w_{ij}]\in \realset^{n\times n}$ is the weighted adjacency matrix indicating how two agents are connected. 

%The presence of a non-smooth regularizer generalizes the decentralized smooth optimization problem, leading to many applications. For example, 
%Problem \eqref{eq:problem} can be used, for example, to perform decentralized training with fairness constraints with respect to certain pre-defined sensitive features \citep{donini2018empirical} by setting $\Psi$ to be an indicator function of a convex compact set. In additon, it can also be used to train \emph{sparse} deep neural networks that use non-smooth $L_0$ and $L_1$ regularizers on the weights to compressed models for deployment  on memory-constrained devices; see \citep{louizos2017learning, wen2016learning} for instance.
%This has led to the use of non-smooth $L_0$ and $L_1$ regularizers for learning sparse networks;  %In these circumstances, decentralized optimization algorithms designed for smooth optimization problems may not be applicable, and it becomes necessary to consider proximal algorithms for handling optimization problems with non-smooth components.
%In this work, we focus on the non-convex stochastic setting, where agents receive a stream of data at a constant rate \citep{nokleby2018stochastic}. 

A majority of the existing decentralized stochastic algorithms for solving~\eqref{eq:problem} require large batch sizes to achieve convergence. %This is undesirable in the streaming data setup where the data cannot be stored in memory for long and should be processed as soon as possible. Other stochastic proximal 
The few algorithms that operate with constant batch sizes mainly rely on complicated variance reduction techniques and require stronger assumptions to establish convergence results. 
%However, given that training large-scale deep neural networks typically involves highly non-convex optimization problems and limited computational resources, using large batch sizes and complicated optimizers may not be desirable. 
To the best of our knowledge, the question of whether it is possible to develop decentralized stochastic optimization algorithms to solve~\eqref{eq:problem} without the above-mentioned limitations remains unresolved. 

To address this, we propose two decentralized stochastic proximal algorithms, \texttt{Prox-DASA} and \texttt{Prox-DASA-GT}, for solving~\eqref{eq:problem} and make the following \textbf{contributions}:
\begin{itemize}[leftmargin=1em]
    \item We show that \texttt{Prox-DASA} is capable of achieving convergence in both homogeneous and bounded heterogeneous settings while \texttt{Prox-DASA-GT} works for general decentralized heterogeneous problems.
    \item We show that both algorithms find an $\epsilon$-stationary point in $\cO(n^{-1}\epsilon^{-2})$ iterations using only $\cO(1)$ stochastic gradient samples per agent and $m$ communication rounds at each iteration, where $m$ can be any positive integer. A topology-independent transient time can be achieved by setting $m=\lceil\frac{1}{\sqrt{1-\rho}}\rceil$, where $\rho$ is the second-largest magnitude among the eigenvalues of the communication matrix.
    \item Through extensive experiments, we demonstrate the superiority of our algorithms over prior works. 
\end{itemize}
A summary of our results and comparison to prior work is provided in Table~\ref{tab:summary}.  

%\textbf{Decentralized Smooth Optimization.} With the rise of contemporary distributed computing, decentralized smooth optimization, which can be viewed as consensus-constrained optimization over a communication network, started to gain popularity a few decades ago\citep{weiss1999multiagent, nedic201010, boyd2011distributed}. Various works have been proposed to analyze its convergence rates \citep{shi2014linear, shi2015extra, xu2015augmented, nedic2017achieving, yuan2018exact}. \cite{lian2017can} proposes D-PSGD and formally validates the advantages of decentralized training, which leads the trend of studying the stochastic version of decentralized optimization \citep{tang2018d, zhang2019decentralized, xin2021improved, pu2021distributed}.
% EXTRA\citep{shi2015extra}, Aug-DGM\citep{xu2015augmented}, DIGing\citep{nedic2017achieving}, Exact-Diffusion\citep{yuan2018exact}. 
% distributed ADMM\citep{chang2014multi, shi2014linear} \kb{the above paragraph adds nothing to the paper :) I suggest moving it to the appendix}

\textbf{Related Works on Decentralized Composite Optimization.} Motivated by wide applications in constrained optimization \citep{lee2013distributed, margellos2017distributed} and non-smooth problems with a composite structure as \eqref{eq:problem}, arising in signal processing \citep{ling2010decentralized, mateos2010distributed, patterson2014distributed} and machine learning \citep{facchinei2015parallel, hong2017prox}, several works have studied the decentralized composite optimization problem in~\eqref{eq:problem}, a natural generalization of smooth optimization. For example, \cite{shi2015proximal, li2019decentralized, alghunaim2019linearly, ye2020decentralized, xu2021distributed, li2021decentralized, sun2022distributed, wu2022unifying}  studied~\eqref{eq:problem} in the convex setting. Furthermore, \cite{facchinei2015parallel, di2016next, hong2017prox, zeng2018nonconvex, scutari2019distributed} studied~\eqref{eq:problem} in the deterministic setting. 

Although there has been a lot of research investigating decentralized composite optimization, the stochastic non-convex setting, which is more broadly applicable, still lacks a full understanding. \cite{wang2021distributed} proposes \texttt{SPPDM}, which uses a proximal primal-dual approach to achieve $\cO(\epsilon^{-2})$ sample complexity. \texttt{ProxGT-SA} and \texttt{ProxGT-SR-O} \citep{xin2021stochastic} incorporate stochastic gradient tracking and multi-consensus update in proximal gradient methods and obtain $\cO(n^{-1}\epsilon^{-2})$ and $\cO(n^{-1}\epsilon^{-1.5})$ sample complexity respectively, where the latter further uses a \texttt{SARAH} type variance reduction method \citep{pham2020proxsarah, wang2019spiderboost}. A recent work \citep{mancino2022proximal} proposes \texttt{DEEPSTORM}, which leverages the momentum-based variance reduction technique and gradient tracking to obtain $\cO(n^{-1}\epsilon^{-1.5})$ and $\tilde{\cO}(\epsilon^{-1.5})$ sample complexity under different stepsize choices. Nevertheless, existing works either require stronger assumptions \citep{mancino2022proximal} or increasing batch sizes \citep{wang2021distributed, xin2021stochastic}. 



% \textbf{Decentralized Optimization}

% \begin{itemize}
%     \item \citep{bianchi2012convergence}: Deterministic + Nonconvex + Projected GD (asymptotically converge to a KKT point)
%     \item \citep{zeng2018nonconvex}: Deterministic + Nonconvex + Proximal GD
%     \item \citep{liu2022accelerated}: Deterministic + Convex + Accelerated + Dual Average
%     \item \citep{liu2022rate}: Deterministic + Non-convex + Dual Average
% \end{itemize}



\begin{table*}[t]
\centering
\renewcommand{\arraystretch}{2}
\caption{Comparison of decentralized proximal gradient based algorithms to find an $\epsilon$-stationary solution to stochastic composite optimization in the nonconvex setting. The sample complexity is defined as the number of required samples per agent to obtain an $\epsilon$-stationary point (see Definition \ref{def: stat&cons}). We omit a comparison with \texttt{SPPDM} \citep{wang2021distributed} as their definition of stationarity differs from ours; see Appendix for further discussions.} %\ref{sec: conserror}
\label{tab:summary}
\resizebox{\textwidth}{!}{%
\begin{tabular}{| c | c| c| c| c| c|}
%\toprule
\hline
 \textbf{Algorithm}   & \makecell{\bf Batch Size} & \makecell{\bf Sample \\ \bf Complexity } & \makecell{\bf Communication \\ \bf Complexity} & \makecell{\bf Linear \\ \bf Speedup?} & \bf Remark\\
%\midrule
\hline
 % \makecell{\texttt{SPPDM}\\ \citep{wang2021distributed}} &  $ \cO(\epsilon^{-1})$ &  $ \cO(n^{-1}\epsilon^{-2})$ &  $ \cO(n^{-1}\epsilon^{-1})$ & \cmark & \\
 %\hline
 \makecell{\texttt{ProxGT-SA}\\ \citep{xin2021stochastic}} &  $\cO(\epsilon^{-1})$ & $\cO(n^{-1}\epsilon^{-2})$ & $\mathcal{O}(\log (n) \epsilon^{-1})$ & \cmark & \\
 \hline
 \makecell{\texttt{ProxGT-SR-O}\\ \citep{xin2021stochastic}} &  $\cO(\epsilon^{-1})$ & $\cO(n^{-1}\epsilon^{-1.5})$ & $\mathcal{O}(\log (n) \epsilon^{-1})$ & \cmark & \makecell{double-loop; \\ mean-squared smoothness}\\
 \hline
 \multirow{2}{*}{\makecell{\texttt{DEEPSTORM} \\ \citep{mancino2022proximal}}} &  $\cO(\epsilon^{-0.5})$ then $\cO(1) ^ *$ & $\cO(n^{-1}\epsilon^{-1.5})$ & $\cO(n^{-1}\epsilon^{-1.5})$ & \cmark & \multirow{2}{*}{\makecell{two time-scale; \\mean-squared smoothness; \\ double gradient evaluations \\ per iteration}}\\ \cline{2-5}
 &  $\cO(1)$ & $\cO(\epsilon^{-1.5}|\log\epsilon|^{-1.5})$ & $\cO(\epsilon^{-1.5}|\log\epsilon|^{-1.5})$ & \xmark & \\
 \hline
%\rowcolor{LightCyan}
   \texttt{Prox-DASA}  (Alg. \ref{algo: Prox-DASA}) & $\mathcal{O}\left(1\right)$ &  $\mathcal{O}(n^{-1}\epsilon^{-2})$ &  $\mathcal{O}(n^{-1} \epsilon^{-2})$ & \cmark & \makecell{bounded heterogeneity}\\
 \hline
\rowcolor{LightCyan}
   \texttt{Prox-DASA-GT} (Alg. \ref{algo: Prox-DASA-GT})  & $\mathcal{O}\left(1\right)$ &  $\mathcal{O}(n^{-1}\epsilon^{-2})$ &  $\mathcal{O}(n^{-1}\epsilon^{-2})$ & \cmark & \\
\hline
%\bottomrule
\end{tabular}
}
\footnotesize{$^*$ It requires $\cO(\epsilon^{-0.5})$ batch size in the first iteration and then $\cO(1)$ for the rest (see $m_0$ in Algorithm 1 in \cite{mancino2022proximal}).}
\end{table*}

\section{Preliminaries}

\textbf{Notations.} $\|\cdot\|$ denotes the $\ell_2$-norm for vectors and Frobenius norm for matrices. $\|\cdot\|_2$ denotes the spectral norm for matrices. $\mathbf{1}$ represents the all-one vector, and $\mI$ is the identity matrix as a standard practice. We identify vectors at agent $i$ in the subscript and use the superscript for the algorithm step. For example, the optimization variable of agent $i$ at step $k$ is denoted as $x^k_i$, and $z^k_i$ is the corresponding dual variable. We use uppercase bold letters to represent the matrix that collects all the variables from nodes (corresponding lowercase) as columns. We add an overbar to a letter to denote the average over all nodes. For example, we denote the optimization variables over all nodes at step $k$ as
$\mX_k = \left[x_{1}^{k}, \dots, x_{n}^{k}\right].$
The corresponding average over all nodes can be thereby defined as
\begin{align*}
\bar x^k &= \avein x_{i}^{k} = \frac{1}{n}\mX_k \bfone,\\
\bar \mX_k &= [\bar x^k, \dots, \bar x^k] = \bar x^k \bfonet = \frac{1}{n}\mX_k \bfone \bfonet.
\end{align*}
For an extended valued function $\Psi: \realset^d \rightarrow \realset \cup \{+\infty\}$, its effective domain is written as $\dom(\Psi) = \{x \mid \Psi(x)<+\infty\}$. A function $\Psi$ is said to be proper if $\dom(\Psi)$ is non-empty. For any proper closed convex function $\Psi$, $x\in\realset^d$, and scalar $\gamma>0$, the proximal operator is defined as
\[
\prox_{\Psi}^{\gamma}(x) = \argmin_{y\in \realset^d}\left\{\frac{1}{2\gamma}\|y - x\|^2  + \Psi(y)\right\} .
\]
For $x, z\in \realset^d$ and $\gamma > 0$, the proximal gradient mapping of $z$ at $x$ is defined as
\[
\cG(x, z, \gamma) = \frac{1}{\gamma}\left( x -  \prox_{\Psi}^{\gamma}(x-\gamma z)\right).
\]
All random objects are properly defined in a probability space $(\Omega, \setF, \mathbb{P})$, and we write $x \in \cH$ if $x$ is $\cH$-measurable given a sub-$\sigma$-algebra $\cH \subseteq \setF$ and a random vector $x$. We use $\sigma(\cdot)$ to denote the $\sigma$-algebra generated by all the argument random vectors.

\textbf{Assumptions.} Next, we list and discuss the assumptions  made in this work.

\begin{assumption}\label{aspt:gossipMatrix}
The weighted adjacency matrix $\mW=(w_{ij})\in\realset^{n\times n}$ is symmetric and doubly stochastic, i.e., $$ \mW = \mW^\top,\ \mW \mathbf{1}_n = \mathbf{1}_n,\  w_{ij}\geq 0,\ \forall i, j, $$ and its eigenvalues satisfy $1=\lambda_1 > \lambda_2 \geq \dots \geq \lambda_n$ and $\rho\coloneqq \max\{|\lambda_2|, |\lambda_n|\}<1$.
\end{assumption}

\begin{assumption}\label{aspt:lipschitz-gradient}
All functions $\{F_{i}\}_{1\leq i\leq n}$ have Lipschitz continuous gradients with Lipschitz constants $L_{\nabla F_{i}}$, respectively. Therefore, $\nabla F$ is $L_{\nabla F}$-Lipschitz continuous with $L_{\nabla F} ={\max}_{1\leq i\leq n} \{L_{\nabla F_i}\}$.
\end{assumption}

\begin{assumption}\label{aspt:Psi}
The function $\Psi: \realset^d \rightarrow \realset\cup\{+\infty\}$ is a closed proper convex function.
\end{assumption}

For stochastic oracles, we assume that each node $i$ at every iteration $k$ is able to obtain a local random data vector $\xi^{k}_i$. The induced natural filtration is given by $\setF_0 = \{\emptyset, \Omega\}$ and 
\[
    \setF_k \coloneqq \sigma\left(\xi^{t}_i \mid i =1 ,\dots, n, \, t=1,\dots, k \right), \forall k\geq 1.
\]
We require that the stochastic gradient $\nabla G_i(\cdot, \xi^{k+1}_i)$ is unbiased conditioned on the filtration $\setF_k$. 
\begin{assumption}[Unbiasedness]\label{aspt: Unbiasness} For any $k\geq 0, x\in \setF_k$, and $1\leq i\leq n$, $\E\left[\nabla G_i(x, \xi^{k+1}_i)\mid \setF_k\right] = \nabla F_{i}(x).$
\end{assumption}
\begin{assumption}[Independence]\label{aspt: independence} For any $k\geq 0, 1\leq i, j\leq n, i\neq j,\ \xi_i^{k+1}$ is independent of $\setF_k$, and $\xi_i^{k+1}$ is independent of $\xi_j^{k+1}$.
\end{assumption}
In addition, we consider two standard assumptions on the variance and heterogeneity of stochastic gradients.
\begin{assumption}[Bounded variance]\label{aspt: Bounded Variance} For any $k\geq 0, x\in \setF_k$, and $1\leq i\leq n$, $$\E\left[\norm{\nabla G_i(x, \xi^{k+1}_i) - \nabla F_{i}(x)}^2\middle\vert \setF_k\right] \leq \sigma^2_i.$$ Let $\sigma^2 = \frac{1}{n}\sum_{i=1}^{n} \sigma_i^2$.
\end{assumption}

\begin{assumption}[Gradient heterogeneity]\label{aspt: Gradient heterogeneity} There exists a constant $\nu\geq0$ such that for all $1 \leq
i \leq n, x\in \realset^d$, $$\norm{\nabla F_i(x) - \nabla F(x)} \leq \nu.$$
\end{assumption}

\begin{remark}[Bounded gradient heterogeneity]
 The above assumption of gradient heterogeneity is standard \citep{lian2017can} and less strict than the bounded second moment assumption on stochastic gradients, which implies Lipschitzness of functions $\{F_i\}$. However, this assumption is only required for the convergence analysis of \texttt{Prox-DASA} and can be bypassed by employing a gradient tracking step. 
\end{remark}

\begin{remark}[Smoothness and mean-squared smoothness]
Our theoretical results of the proposed methods are only built on the smoothness assumption on functions $\{F_i\}$ without further assuming mean-squared smoothness assumptions on $\{G_{i,\xi}\}$, which is required in all variance reduction based methods in the literature, such as \texttt{ProxGT-SR-O}~\citep{xin2021stochastic} and \texttt{DEEPSTORM}~\citep{mancino2022proximal}. It is worth noting that a clear distinction in the lower bounds of sample complexity for solving stochastic optimization under two different sets of assumptions has been proven in \citep{arjevani2023lower}. Specifically, when considering the mean-squared smoothness assumption, the optimal sample complexity is $\mathcal{O}(\epsilon^{-1.5})$, whereas under smoothness assumptions, it is $\mathcal{O}(\epsilon^{-2})$. The proposed methods in this work achieve the optimal sample complexity under our weaker assumptions.
\end{remark}



\section{Algorithm}

Several algorithms have been developed to solve Problem \eqref{eq:problem} in the stochastic setting; see Table \ref{tab:summary}. However, the most recent two types of algorithms have certain drawbacks: (i) \textbf{increasing batch sizes}: \texttt{ProxGT-SA}, \texttt{ProxGT-SR-O}, and \texttt{DEEPSTORM} with constant step sizes (Theorem 1 in \citep{mancino2022proximal}) require batches of stochastic gradients with batch sizes inversely proportional to tolerance $\epsilon$; (ii) \textbf{algorithmic complexities}: \texttt{ProxGT-SR-O} and \texttt{DEEPSTORM} are either double-looped or two-time-scale, and require stochastic gradients evaluated at different parameter values over the same sample, i.e., $\nabla G_i(x, \xi)$ and $\nabla G_i(x', \xi)$. These variance reduction techniques are unfavorable when gradient evaluations are computationally expensive such as forward-backward steps for deep neural networks. (iii) \textbf{theoretical weakness}: the convergence analyses of \texttt{ProxGT-SR-O} and \texttt{DEEPSTORM} are established under the \emph{stronger} assumption of mean-squared Lipschitzness of stochastic gradients. In addition, Theorem 2 in \citet{mancino2022proximal} fails to provide linear-speedup results for the one-sample variant of \texttt{DEEPSTORM} with diminishing stepsizes.

\subsection{Decentralized Proximal Averaged Stochastic Approximation}

To address the above limitations, we propose \textbf{D}e-centralized \textbf{Prox}imal \textbf{A}veraged \textbf{S}tochastic \textbf{A}pproximation (\texttt{Prox-DASA}) which leverages a common averaging technique in stochastic
optimization \citep{ruszczynski2008merit, mokhtari2018conditional, ghadimi2020single} to reduce the error of gradient estimation. In particular, the sequences of dual variables $\mZ^k = [ z_1^k, \dots, z_n^k]$ that aim to approximate gradients are defined in the following recursion:
\begin{equation*}
\begin{split}
    \mZ^{k+1} &= \left\{(1-\alpha_k) \mZ^{k} + \alpha_k \mV^{k+1}\right\} \mW^m\\
    \mV^{k+1} &= [v_1^{k+1}, \dots, v_n^{k+1}],
\end{split}
\end{equation*}
where each $v_i^{k+1}$ is the local stochastic gradient evaluated at the local variable $x_i^k$. For complete graphs where each pair of graph vertices is connected by an edge and there is no consensus error for optimization variables, i.e., $\mW=\frac{1}{n}\bfone\bfonet$ and $x_i^k = x_j^k, \forall i,j$, the averaged dual variable over nodes $\bar z^k$ follows the same averaging rule as in centralized algorithms:
\begin{equation*}
\begin{split}
    \bar z^{k+1} &= (1-\alpha_k) \bar z^{k} + \alpha_k \bar v^{k+1}\\
    \E[\bar v^{k+1} |\setF_k] &= \nabla F(\bar x^k).
\end{split}
\end{equation*}
To further control the consensus errors, we employ a multiple consensus step for both primal and dual iterates $\{x^k_i, z^k_i\}$, which multiplies the matrix of variables from all nodes by the weight matrix $m$ times. The pseudocode of \texttt{Prox-DASA} is given in Algorithm \ref{algo: Prox-DASA}.
\begin{algorithm}[t]
    \caption{\texttt{Prox-DASA}}\label{algo: Prox-DASA}
    \SetAlgoLined
    \KwIn{$x_i^0 = z_i^0 = \mathbf{0}, \gamma, \{\alpha_k\}_{k\geq 0}, m$}
    \For{$k=0, 1,\dots,K-1$}{
        \CommentSty{\# Local Update}\\
        \For{$i=1,2,\dots,n$ (in parallel)}{
            $y_i^k = \prox_{\Psi}^{\gamma}\left(x_{i}^{k} - \gamma z_{i}^{k}\right)$\\
            %$y_{i}^{k} = \argmin_{y\in \realset^d}\left\{\<z_{i}^{k}, y - x_{i}^{k}> + \frac{1}{2\gamma}\|y - x_{i}^{k}\|^2 + \Psi(y)\right\}$ \\
            $\tilde{x}_{i}^{k+1} = (1- \alpha_k)x_{i}^{k} + \alpha_ky_{i}^{k}$\\
            \CommentSty{\# Compute stochastic gradient}\\
            $v_{i}^{k+1} = \nabla G_{i}(x_{i}^{k}, \xi_{i}^{k+1})$\\
            % $\tilde u_i^{k+1} = u_i^k + v_i^{k+1} - v_i^k$ \\
            $\tilde{z}_{i}^{k+1} = (1 - \alpha_k)z_{i}^{k} + \alpha_k v_{i}^{k+1}$\\
        }
        \CommentSty{\# Communication}\\
        $[x_1^{k+1}, \dots, x_{n}^{k+1}] = [\tilde{x}_1^{k+1}, \dots, \tilde{x}_{n}^{k+1}]\mW^m$\\
        $[z_1^{k+1}, \dots, z_{n}^{k+1}] = [\tilde{z}_1^{k+1}, \dots, \tilde{z}_{n}^{k+1}]\mW^m$
    }
\end{algorithm}


\subsection{Gradient Tracking}

The constant $\nu$ defined in Assumption \ref{aspt: Gradient heterogeneity} measures the heterogeneity between local gradients and global gradients, and hence the variance of datasets of different agents. To remove $\nu$ in the complexity bound, \cite{tang2018d} proposed the $\text{D}^2$ algorithm, which modifies the $x$ update in D-PSGD \citep{lian2017can}. However, it requires one additional assumption on the eigenvalues of the mixing matrix $\mW$. Here we adopt the gradient tracking technique, which was first introduced to deterministic distributed optimization to improve the convergence rate \citep{xu2015augmented, di2016next, nedic2017achieving, qu2017harnessing}, and was later proved to be useful in removing the data variance (i.e., $\nu$) dependency in the stochastic case \citep{zhang2019decentralized, lu2019gnsd, pu2021distributed, koloskova2021improved}. In the convergence analysis of \texttt{Prox-DASA}, an essential step is to control the heterogeneity of stochastic gradients, i.e., $\E[\norm{\mV^{k+1} - \bar \mV^{k+1}}^2]$, which requires bounded heterogeneity of local gradients (Assumption \ref{aspt: Gradient heterogeneity}). To bypass this assumption, we employ a gradient tracking step by replacing $\mV^{k+1}$ with pseudo stochastic gradients $\mU^{k+1} = [u_1^{k+1}, \dots, u_n^{k+1}]$, which are updated as follows:
\begin{equation*}
    \mU^{k+1} = \left(\mU^k + \mV^{k+1} - \mV^{k}\right) \mW^m.
\end{equation*}
Provided that $\mU^0 = \mV^0$ and $\mW\bfone = \bfone$, one can show that $\bar u^k = \bar v^k$ at each step $k$. In addition, with the consensus procedure over $\mU^k$, the heterogeneity of pseudo stochastic gradients $\E[\norm{\mU^{k+1} - \bar \mU^{k+1}}^2]$ can be bounded above. The proposed algorithm, named as \texttt{Prox-DASA} with Gradient Tracking (\texttt{Prox-DASA-GT}), is presented in Algorithm \ref{algo: Prox-DASA-GT}.
\begin{algorithm}[t]
    \caption{\texttt{Prox-DASA-GT}}\label{algo: Prox-DASA-GT}
    \SetAlgoLined
    \KwIn{$x_i^0 = z_i^0 = u_i^0 = v_i^0 = \mathbf{0}, \gamma, \{\alpha_k\}_{k\geq 0}, m$}
    \For{$k=0, 1,\dots,K$}{
        \CommentSty{\# Local Update}\\
        \For{$i=1,2,\dots,n$ (in parallel)}{
            $y_{i}^{k} = \prox_{\Psi}^{\gamma}\left(x_{i}^{k} - \gamma z_{i}^{k}\right)$\\
            $\tilde{x}_{i}^{k+1} = (1- \alpha_k)x_{i}^{k} + \alpha_ky_{i}^{k}$\\
            \CommentSty{\# Compute stochastic gradient}\\
            $v_{i}^{k+1} = \nabla G_{i}(x_{i}^{k}, \xi_{i}^{k+1})$\\
            $\tilde u_i^{k+1} = u_i^k + v_i^{k+1} - v_i^k$ \\
            $\tilde{z}_{i}^{k+1} = (1 - \alpha_k)z_{i}^{k} + \alpha_k \tilde{u}_{i}^{k+1}$\\
        }
        \CommentSty{\# Communication}\\
        $[x_1^{k+1}, \dots, x_{n}^{k+1}] = [\tilde{x}_1^{k+1}, \dots, \tilde{x}_{n}^{k+1}]\mW^m$\\
        $[u_1^{k+1}, \dots, u_{n}^{k+1}] = [\tilde{u}_1^{k+1}, \dots, \tilde{u}_{n}^{k+1}]\mW^m$\\
        $[z_1^{k+1}, \dots, z_{n}^{k+1}] = [\tilde{z}_1^{k+1}, \dots, \tilde{z}_{n}^{k+1}]\mW^m$
    }
\end{algorithm}

\subsection{Consensus Algorithm}
In practice, we can leverage accelerated consensus algorithms, e.g., \cite{liu2011accelerated, olshevsky2017linear}, to speed up the multiple consensus step $\mW^m$ to achieve improved communication complexities when $m>1$. Specifically, we can replace $\mW^m$ by a Chebyshev-type polynomial of $\mW$ as described in Algorithm \ref{algo: acc-consensus}, which can improve the $\rho$-dependency of the communication complexity from a factor of $\frac{1}{1-\rho}$ to $\frac{1}{\sqrt{1-\rho}}$.

\begin{algorithm}[th]
    \caption{Chebyshev Mixing Protocol}\label{algo: acc-consensus}
    \SetAlgoLined
    \KwIn{Matrix $\mX$, mixing matrix $\mW$, rounds $m$}
    Set $\mA_0= \mX, \mA_1 = \mX\mW, \rho=\max\{|\lambda_2(\mW)|, |\lambda_n(\mW)|\}<1, \mu_0 = 1, \mu_1 = \frac{1}{\rho}$\\
    \For{$t=1,\dots,m-1$}{
        $\mu_{t+1} = \frac{2}{\rho}\mu_t - \mu_{t-1}$\\
        $\mA_{t+1} = \frac{2\mu_t}{\rho\mu_{t+1}}\mA_t \mW - \frac{\mu_{t-1}}{\mu_{t+1}} \mA_{t-1}$
    }
    \KwOut{$\mA_m$}
\end{algorithm}

\section{Convergence Analysis}

\subsection{Notion of Stationarity}

For centralized optimization problems with non-convex objective function $F(x)$, a standard measure of non-stationarity of a point $\bar x$ is the squared norm of proximal gradient mapping of $\nabla F(\bar x)$ at $\bar x$, i.e.,
\begin{equation*}
\norm{\cG(\bar x, \nabla F(\bar x), \gamma)}^2 = \norm{\frac{1}{\gamma}\left(\bar x - \prox_{\Psi}^{\gamma}(\bar x-\gamma \nabla F(\bar x))\right)}^2.
\end{equation*}
For the smooth case where $\Psi(x)\equiv 0$, the above measure is reduced to $\norm{\nabla F(\bar x)}^2$.

However, in the decentralized setting with a connected network $\mathbb{G}$, we solve the following equivalent reformulated consensus optimization problem:
\begin{equation}\label{eq:problem-consensus}
\begin{split}
    \underset{x_1,\dots,x_n\in\realset^d}{\min}& \quad \frac{1}{n} \sum_{i=1}^{n}\left\{ F_i(x_i) + \Psi(x_i)\right\}\\
    \text{s.t.}\quad &\quad    x_i = x_j,\ \forall (i,j).
\end{split}
\end{equation}
To measure the non-stationarity in Problem \eqref{eq:problem-consensus}, one should consider not only the stationarity violation at each node but also the consensus errors over the network. Therefore, \citet{xin2021stochastic} and \citet{mancino2022proximal} define an $\epsilon$-stationary point $\mX= [x_1, \dots, x_n]$ of Problem \eqref{eq:problem-consensus} as
\begin{equation}\label{def: previous stationarity}
\E\left[\frac{1}{n}\sum_{i=1}^{n}\left\{\norm{\cG(x_i, \nabla F(x_i), \gamma)}^2 + L_{\nabla F}^2\norm{x_i - \bar x}^2\right\}\right] \leq \epsilon.    
\end{equation}
In this work, we use a general measure as follows.
\begin{definition}\label{def: stat&cons}
    Let $\mX = [x_1, \dots, x_n]$ be random vectors generated by a decentralized algorithm to solve Problem \eqref{eq:problem-consensus} and $\bar x = \frac{1}{n}\sum_{i=1}^{n}x_i$. We say that $\mX$ is an $\epsilon$-stationary point of Problem \eqref{eq:problem-consensus} if
    \begin{align}
    &\text{(stationarity violation)} &\E\left[\norm{\cG(\bar x, \nabla F(\bar x), \gamma)}^2\right] \leq \epsilon, \notag\\
    &\text{(consensus error)} &\textstyle \E\left[\frac{L_{\nabla F}^2}{n}\norm{\mX - \bar \mX}^2\right] \leq \epsilon \notag.
    \end{align}
\end{definition}
The next inequality characterizes the difference between the gradient mapping at $\bar x$ and $x_i$, which relates our definition to \eqref{def: previous stationarity}. Note that, by the non-expansiveness of the proximal operator, we have $\norm{\cG(x_i, \nabla F(x_i), \gamma) - \cG(\bar x, \nabla F(\bar x), \gamma)}\leq \tfrac{2+\gamma L_{\nabla F}}{\gamma}\norm{x_i - \bar x}$, implying
\begin{align*}
    \frac{1}{n}&\sum_{i=1}^{n}\norm{\cG(x_i, \nabla F(x_i), \gamma)}^2 \\
    &\lesssim \norm{\cG(\bar x, \nabla F(\bar x), \gamma)}^2 + \tfrac{1}{\gamma^2 n}\norm{\mX - \bar \mX}^2.
\end{align*}




\subsection{Main Results}

We present the complexity results of our algorithms below.

\begin{theorem}\label{thm: main}
    Suppose Assumptions \ref{aspt:gossipMatrix}, \ref{aspt:lipschitz-gradient}, \ref{aspt:Psi}, \ref{aspt: Unbiasness}, \ref{aspt: independence}, \ref{aspt: Bounded Variance} hold and the total number of iterations $K\geq K_0$, where $K_0$ is a constant that only depends on constants $(n, L_{\nabla F}, \varrho(m), \gamma)$, where $\varrho(m) = \tfrac{(1+\rho^{2m})\rho^{2m}}{(1-\rho^{2m})^2}$. Let $C_0$ be some initialization-dependent constant and $R$ be a random integer uniformly distributed over $\{1, 2, \dots, K\}$. Suppose we set $\alpha_k \asymp \sqrt{\tfrac{n}{K}}, \gamma \asymp \tfrac{1}{L_{\nabla F}}$.
    \begin{itemize}[leftmargin=0pt]
        \item[] \textbf{(Prox-DASA)} Suppose in addition Assumption \ref{aspt: Gradient heterogeneity} also holds. Then, for Algorithm \ref{algo: Prox-DASA} we have
        \begin{align*}
            &\E\left[\norm{\cG(\bar x^R, \nabla F(\bar x^R), \gamma)}^2\right] \\
            &\qquad \lesssim \frac{\gamma^{-1} C_0 + \sigma^2}{\sqrt{nK}} + \frac{n(\sigma^2 +\gamma^{-2}\nu^2)\varrho(m)}{K},\\
            &\E\left[\norm{\bar z^R - \nabla F(\bar x^R)}^2\right] \\
            &\qquad \lesssim \frac{\gamma^{-1} C_0 + \sigma^2}{\sqrt{nK}} + \frac{n(\sigma^2 +\gamma^{-2}\nu^2)\varrho(m)}{K},\\
            &\E\left[\frac{L_{\nabla F}^2}{n}\norm{\mX_R - \bar \mX_R}^2+ \frac{1}{n}\norm{\mZ_R - \bar \mZ_R}^2\right]\\
            &\qquad \lesssim \frac{n(\sigma^2 +\gamma^{-2}\nu^2)\varrho(m)}{K}.
        \end{align*}

        \item[] \textbf{(Prox-DASA-GT)} For Algorithm \ref{algo: Prox-DASA-GT} we have 
        \begin{align*}
            &\E\left[\norm{\cG(\bar x^R, \nabla F(\bar x^R), \gamma)}^2\right] \lesssim \frac{\gamma^{-1} C_0 + \sigma^2}{\sqrt{nK}} + \frac{n\sigma^2\varrho(m)}{K},\\
            &\E\left[\norm{\bar z^R - \nabla F(\bar x^R)}^2\right] \lesssim \frac{\gamma^{-1} C_0 + \sigma^2}{\sqrt{nK}} + \frac{n\sigma^2\varrho(m)}{K},\\
            &\E\left[\frac{L_{\nabla F}^2}{n}\norm{\mX_R - \bar \mX_R}^2 + \frac{1}{n}\norm{\mZ_R - \bar \mZ_R}^2\right] \lesssim \frac{n\sigma^2\varrho(m)}{K}.
        \end{align*}
    \end{itemize}
\end{theorem}
In Theorem \ref{thm: main}, for simplicity we assume $\gamma \asymp \frac{1}{L_{\nabla F}}$, which can be relaxed to $\gamma > 0$. We have the following corollary characterizing the complexity of Algorithms \ref{algo: Prox-DASA} and \ref{algo: Prox-DASA-GT} for finding $\epsilon$-stationary points. The proof is immediate.
\begin{corollary}\label{cor: complexity}
    Under the same conditions as in Theorem \ref{thm: main}, provided that $K\gtrsim n^3 \varrho(m)$, for any $\epsilon > 0$ the sample complexity per agent for finding $\epsilon$-stationary points
    with Algorithms \ref{algo: Prox-DASA} and \ref{algo: Prox-DASA-GT} is $\cO(\max\{n^{-1}\epsilon^{-2}, K_T\})$, where the transient time $K_T \asymp \max\{K_0, n^3 \varrho(m)\}$.
\end{corollary}


\begin{remark}[Sample complexity]
    For a sufficiently small $\epsilon>0$, Corollary \ref{cor: complexity} implies that the sample complexity of Algorithms \ref{algo: Prox-DASA} and \ref{algo: Prox-DASA-GT} matches the optimal lower bound $\cO(n^{-1}\epsilon^{-2})$ in decentralized \emph{smooth} stochastic non-convex optimization \citep{lu2021optimal}.
\end{remark}

\begin{remark}[Transient time and communication complexity]
    Our algorithms can achieve convergence with a single communication round per iteration, i.e., $m=1$, leading to a topology-independent $\cO(n^{-1}\epsilon^{-2})$ communication complexity. In this case, however, the transient time $K_T$ still depends on $\rho$, as is also the case for smooth optimization problems \citep{xin2021improved}. When considering multiple consensus steps per iteration with the communication complexity being $\cO(mn^{-1}\epsilon^{-2})$, setting $m\asymp \lceil \tfrac{1}{1-\rho}\rceil$ (or $m\asymp \lceil \tfrac{1}{\sqrt{1-\rho}}\rceil$ for accelerated consensus algorithms) results in a topology-independent transient time given that $\varrho(m) \asymp 1$.
\end{remark}

\begin{remark}[Dual convergence]
    An important aspect to emphasize is that in our proposed methods, the sequence of average dual variables $\bar z^k = \frac{1}{n}\sum_{i=1}^{n} z_i^k$ converges to $\nabla F(\bar x^k)$, while the consensus error of $\{z_1^k, \dots, z_n^k\}$ decreases to zero. Our approach achieves this desirable property, which is commonly observed in modern variance reduction methods~\citep{gower2020variance}, without the need for complex variance reduction operations in each iteration. As a result, it provides a reliable termination criterion in the stochastic setting without requiring large batch sizes.
\end{remark}




\subsection{Proof Sketch}
\label{sec: proof_sketch}
Here, we present a sketch of our convergence analyses and defer the details to the Appendix. Our proof relies on the merit function below:
\begin{equation*}
\begin{split}
    W(\bar x^k,\bar z^k) = &\underbrace{\Phi(\bar x^{k}) - \Phi_*}_{\text{function value gap}} + \underbrace{\Psi(\bar x^k) - \eta(\bar x^{k}, \bar z^{k})}_{\text{primal convergence}} \\
    &+ \lambda \underbrace{\norm{\nabla F(\bar x^k) - \bar z^k}^2}_{\text{dual convergence}},
\end{split}
\end{equation*}
where $\eta(x, z) = \underset{y\in \realset^d}{\min}\left\{\<z,y-x> + \frac{1}{2\gamma}\|y-x\|^2 + \Psi(y)\right\}.$
Let $y^k_+ \coloneqq \prox_{\Psi}^{\gamma}\left(\bar x^k - \gamma \bar z^k\right)$. Then, the proximal gradient mapping of $\bar z^k$ at $\bar x^k$ is $\cG(\bar x^k, \bar z^k, \gamma) = \frac{1}{\gamma}(\bar x^k - y^k_+). $
Since $y^{k}_{+}$ is the minimizer of a $1/\gamma$-strongly convex function, we have
\begin{equation*}
\begin{split}
    \<\bar z^k, y^k_+ - \bar x^k> + &\frac{1}{2\gamma}\|y^k_+ - \bar x^k\|^2 +\Psi(y^k_+) \\
    &\leq \Psi(\bar x^k)  - \frac{1}{2\gamma}\|y^k_+ - \bar x^k\|^2,
\end{split}
\end{equation*}
implying the relation between $\Psi(\bar x^k) - \eta(\bar x^k, \bar z^k)$ and primal convergence: $$\Psi(\bar x^k) - \eta(\bar x^k, \bar z^k) \geq \frac{\gamma}{2} \norm{\cG(\bar x^k, \bar z^k, \gamma)}^2.$$

Following standard practices in optimization,  we set $\gamma = \frac{1}{L_{\nabla F}}$ below for simplicity. However, our algorithms do not require any restriction on the choice of $\gamma$.

\textbf{Step 1:} Leveraging the merit function with $\lambda \asymp \gamma$, we can first obtain an essential lemma (Lemma 11 in Appendix) in our analyses, which says that for sequences $\{x_{i}^k, z_i^k\}_{1\leq i\leq n, k\geq 0}$ generated by \texttt{Prox-DASA(-GT)} (Algorithm \ref{algo: Prox-DASA} or \ref{algo: Prox-DASA-GT}) with $\alpha_k\lesssim \min\{1, (1+\gamma)^{-2}, \gamma^2(1+\gamma)^{-2}\}$, we have
\begin{equation*}
\begin{split}
     W(\bar x^{k+1}, \bar z^{k+1}) - &W(\bar x^{k}, \bar z^{k}) \\
     &\leq - \alpha_k \left\{\Theta^k  + \Upsilon^k + \alpha_k \Lambda^k + r^{k+1}\right\},
\end{split}
\end{equation*}
where $\E[r^{k+1}\mid\setF_k] = 0$, $\Lambda^k \asymp \gamma\norm{\bar \Delta^{k+1}}^2$, 
\begin{equation*}
\begin{split}
    &\Theta^k \asymp \frac{1}{\gamma} \|\bar x^k  - \bar y^k\|^2 + \gamma \norm{\nabla F(\bar x^k) - \bar z^k}^2, \\
    &\Upsilon^k \asymp \frac{\gamma}{n}\norm{\mZ_k - \bar \mZ_k}^2 + \frac{1}{n\gamma}\norm{\mX_k - \bar \mX_k}^2,
\end{split}
\end{equation*}
and $\bar\Delta^{k+1} = \bar v^{k+1} - \frac{1}{n}\sum_{i=1}^{n} \nabla F_i(x^k_i) = \bar u^{k+1} - \frac{1}{n}\sum_{i=1}^{n} \nabla F_i(x^k_i)$ (for \texttt{Prox-DASA-GT}). Thus, by telescoping and taking expectation with respect to $\setF_0$, we have
\begin{equation}\label{ineq: sketch-main-ineq}
\begin{split}
    &\sum_{k=0}^{K} \alpha_k\E\left[\norm{\bar x^k - \bar y^k}^2 + \gamma^2\norm{\nabla F(\bar x^k) - \bar z^k}^2\right]\\
    &\lesssim  \gamma W(\bar x^0, \bar z^0)+ \gamma^2\sigma^2 \boxed{\sum_{k=0}^{K}\frac{\alpha_k^2}{n}} \\
    &\quad + \sum_{k=0}^{K} \frac{\alpha_k\left\{\E\left[\norm{\mX_k - \bar \mX_k}^2 + \gamma^2\norm{\mZ_k - \bar \mZ_k}^2\right]\right\}}{n}.
\end{split}
\end{equation}

\textbf{Step 2:} We then analyze the consensus errors. Without loss of generality, we consider $\mX_0 = \bar \mX_0 = \mathbf{0}$, i.e., all nodes have the same initialization at $\mathbf{0}$. For $m\in\mathbb{N}_+$, define
\[\varrho(m) = \frac{(1+\rho^{2m})\rho^{2m}}{(1-\rho^{2m})^2}.\] Then, we have the following facts:
\begin{itemize}
    \item $\varrho(m)$ is monotonically decreasing with the maximum value being $\varrho(1) = \frac{(1+\rho^{2})\rho^{2}}{(1-\rho^{2})^2}:= \varrho_1$;
    \item $\varrho(m)\leq 1$ if and only if $\rho^{2m}\leq \frac{1}{3}$.
\end{itemize}
With the definition of $\varrho(m)$ and assuming $0<\alpha_{k+1}\leq \alpha_k\leq 1$, we can show the consensus errors have the following upper bounds.

\texttt{Prox-DASA}: Let $\alpha_k \lesssim \varrho(m)^{-\frac{1}{2}}$, we have
\begin{align}
&\sum_{k=0}^{K}\frac{\alpha_k}{n} \E\left[\norm{\mX_{k} - \bar \mX_k}^2 \right] \leq \sum_{k=0}^{K}\frac{\gamma^2\alpha_k}{n} \E\left[\norm{\mZ_{k} - \bar \mZ_k}^2\right] \notag\\
&\hspace{0.1\textwidth}\lesssim (\gamma^2\sigma^2 + \nu^2)\varrho(m)\boxed{\sum_{k=0}^{K}\alpha_k^{3}}.\label{ineq: prox-dasa-consensus}
\end{align}

\texttt{Prox-DASA-GT}: Let $\alpha_k \lesssim \min\{\varrho(m)^{-1}, \varrho(m)^{-\frac{1}{2}}\}$, we have
\begin{align}
    &\sum_{k=0}^{K}\frac{\alpha_k}{n}\E\left[\|\mX_k - \bar \mX_k\|^2\right]\leq \sum_{k=0}^{K}\frac{\gamma^2\alpha_k}{n}\E\left[\|\mZ_k - \bar \mZ_k\|^2\right] \notag\\
    &\lesssim  \varrho(m)^2 \boxed{\sum_{k=0}^{K}\alpha_k^{3}}\left\{\gamma^2\sigma^2 + \alpha_k^2\E\left[\|\bar x^k - \bar y^k\|^2\right] \right\}.\label{ineq: sketch-prox-dasa-gt-consensus}
\end{align}
We can also see that to obtain a topology-independent iteration complexity, the number of communication rounds can be set as $m = \lceil \frac{\log 3}{2(1-\rho)} \rceil$, which implies $\varrho(m) \leq 1$.

In addition, we have the following fact that relates the consensus error of $\mY$ to the consensus errors of $\mX$ and $\mZ$:
\begin{equation*}
\begin{split}
        &\norm{y^k_+ - \bar y^k}^2 + \frac{1}{n} \norm{\mY_k - \bar \mY_k}^2 = \frac{1}{n} \sum_{i=1}^{n} \norm{y_i^k - y^k_+}^2 \\
        &\quad \leq \frac{2}{n} \left\{\|\mX_{k} - \bar \mX_k\|^2 + \gamma^2\|\mZ_{k} - \bar \mZ_k\|^2 \right\}.
\end{split}
\end{equation*}


\textbf{Step 3:} Let $R$ be a random integer with 
\[
\Pr(R = k)=\frac{\alpha_k}{\sum_{j=1}^{K}\alpha_j}, \quad k=1,2,\dots, K.
\]
Dividing both sides of \eqref{ineq: prox-dasa-consensus} by $\sum_{k=1}^{K}\alpha_k$, we can obtain that for \texttt{Prox-DASA}, the consensus error of $\mX_R$ satisfies
\begin{equation*}
    \E\left[\frac{1}{n}\norm{\mX_R - \bar \mX_R}^2\right] \lesssim (\gamma^2\sigma^2 +\nu^2)\varrho(m)\frac{\sum_{k=0}^{K} \alpha_k^3}{\sum_{k=1}^{K} \alpha_k}.
\end{equation*}
Moreover, noting that
\begin{equation*}
\begin{split}
    \norm{\cG(\bar x^k, \nabla F(\bar x^k), \gamma)}^2 \lesssim &\frac{1}{\gamma^2} \left\{\norm{\bar x^k - \bar y^k}^2 
    + \norm{y^k_+ - \bar y^k}^2\right\} \\
    &\quad +\norm{\nabla F(\bar x^k) - \bar z^k}^2,
\end{split}
\end{equation*}
and combining \eqref{ineq: sketch-main-ineq} with \eqref{ineq: prox-dasa-consensus}, we can get
\begin{equation*}
    \begin{split}
    &\E\left[\norm{\cG(\bar x^R, \nabla F(\bar x^R), \gamma)}^2\right] \lesssim \underbrace{\boxed{\frac{W(\bar x^0, \bar z^0)}{\gamma\sum_{k=1}^{K}\alpha_k}}}_{\text{initialization-related term}} \\
    &+ \underbrace{\boxed{\sigma^2 \frac{\sum_{k=0}^{K}\alpha_k^2}{n\sum_{k=1}^{K}\alpha_k}}}_{\text{variance-related term}} + \underbrace{\boxed{(\sigma^2 +\gamma^{-2}\nu^2)\varrho(m)\frac{\sum_{k=0}^{K} \alpha_k^3}{\sum_{k=1}^{K} \alpha_k}}}_{\text{consensus error}}.
    \end{split}
\end{equation*}
Thus, setting $\alpha_k \asymp \sqrt{\frac{n}{K}} $, we obtain the convergence results of \texttt{Prox-DASA}:

\begin{equation*}
    \begin{split}
    &\E\left[\norm{\cG(\bar x^R, \nabla F(\bar x^R), \gamma)}^2\right] \\
    &\quad \lesssim \frac{\gamma^{-1} W(\bar x^0 , \bar z^0) + \sigma^2}{\sqrt{nK}} + \frac{n(\sigma^2 +\gamma^{-2}\nu^2)\varrho(m)}{K},\\
    &\E\left[\frac{1}{\gamma^2 n}\norm{\mX_R - \bar \mX_R}^2\right] \lesssim \frac{n(\sigma^2 +\gamma^{-2}\nu^2)\varrho(m)}{K}.
    \end{split}
\end{equation*}

For \texttt{Prox-DASA-GT}, we can complete the proof with similar arguments by combining \eqref{ineq: sketch-prox-dasa-gt-consensus} with \eqref{ineq: sketch-main-ineq} and noting that $\varrho(m)^2 \alpha_k^4 \lesssim 1$.

\section{Experiments}

\subsection{Synthetic Data}


To demonstrate the effectiveness of our algorithms, we first evaluate our algorithms using synthetic data for solving sparse single index models \citep{alquier2013sparse} in the decentralized setting. We consider the homogeneous setting where the data sample at each node $\xi=(X, Y)$ is generated from the same single index model $Y = g(X^\top \theta_*) + \varepsilon$, where $X, \theta_* \in \realset^d$ and $\E[ \varepsilon| X]=0$. In this case, we solve the following $L_1$-regularized least-squares problem:
\begin{equation*}
    \underset{\theta\in \realset^d}{\min}~ \frac{1}{n}\sum_{i=1}^{n} \underset{(X, Y)\sim \cD}{\E}\left[(Y - g(X^\top\theta))^2\right] + \lambda \norm{\theta}_1.
\end{equation*}
In particular, we set $\theta_*\in\realset^{100}$ to be a sparse vector and $g(\cdot) = (\cdot)^2$ which corresponds to the sparse phase retrieval problem \citep{jaganathan2016phase}. We simulate streaming data samples with batch size $=1$ for training and 10,000 data samples per node for evaluations, where $X$ and $\varepsilon$ are sampled independently from two Gaussian distributions. We employ a ring topology for the network where self-weighting and neighbor weights are set to be $1/3$. We set the penalty parameter $\lambda = 0.01$, the total number of iterations $K=10,000$, $\alpha_k = \sqrt{n/K}$, $\gamma=0.01$, and the number of communication rounds per iteration $m=\lceil \frac{1}{1-\rho} \rceil$. We plot the test loss and the norm of proximal gradient mapping in the log scale against the number of iterations in Figure \ref{fig:linear-speedup}, which shows that our decentralized algorithms have an additional linear speed-up with respect to $n$. In other words, the algorithms become faster as more agents are added to the network.
\begin{figure}[!t]
    \centering
    \includegraphics[width=.49\textwidth]{Figures/linear-speedup.pdf}
    \caption{Linear-speedup performance of \texttt{Prox-DASA} for decentralized online sparse phase retrieval problems. (\texttt{Prox-DASA-GT} yields nearly identical plots.)}
    \label{fig:linear-speedup}
\end{figure}

\subsection{Real-World Data}\label{sec: real_data_exp}
\begin{figure*}
    \centering
    \subfigure[]{\label{a9a_acc_time}\includegraphics[width=0.32\textwidth]{Figures/a9a_acc_time.pdf}}
    \subfigure[]{\label{a9a_loss_time}\includegraphics[width=0.32\textwidth]{Figures/a9a_train_loss_time.pdf}}
    \subfigure[]{\label{a9a_stat_time}\includegraphics[width=0.32\textwidth]{Figures/a9a_stat_time.pdf}}
    
    \subfigure[]{\label{a9a_acc_epo}\includegraphics[width=0.32\textwidth]{Figures/a9a_acc_epo.pdf}}
    \subfigure[]{\label{a9a_loss_epo}\includegraphics[width=0.32\textwidth]{Figures/a9a_train_loss_epo.pdf}}
    \subfigure[]{\label{a9a_stat_epo}\includegraphics[width=0.32\textwidth]{Figures/a9a_stat_epo.pdf}}

    \subfigure[]{\label{mnist_acc_time}\includegraphics[width=0.32\textwidth]{Figures/mnist_acc_time.pdf}}
    \subfigure[]{\label{mnist_loss_time}\includegraphics[width=0.32\textwidth]{Figures/mnist_train_loss_time.pdf}}
    \subfigure[]{\label{mnist_stat_time}\includegraphics[width=0.32\textwidth]{Figures/mnist_stat_time.pdf}}
    
    \subfigure[]{\label{mnist_acc_epo}\includegraphics[width=0.32\textwidth]{Figures/mnist_acc_epo.pdf}}
    \subfigure[]{\label{mnist_loss_epo}\includegraphics[width=0.32\textwidth]{Figures/mnist_train_loss_epo.pdf}}
    \subfigure[]{\label{mnist_stat_epo}\includegraphics[width=0.32\textwidth]{Figures/mnist_stat_epo.pdf}}
    \caption{Comparisons between \texttt{SPPDM} \citep{wang2021distributed}, \texttt{ProxGT-SR-E} \citep{xin2021stochastic}, \texttt{DEEPSTORM} \citep{mancino2022proximal}, \texttt{Prox-DASA} (Algorithm~\ref{algo: Prox-DASA}), and \texttt{Prox-DASA-GT} (Algorithm~\ref{algo: Prox-DASA-GT}). The first two rows correspond to a9a and the last two rows correspond to MNIST. The results are averaged over 10 trials, and the shaded regions represent confidence intervals. The vertical axes in the third column are log-scale. It should be noted that \texttt{ProxGT-SR-E} maintains another hyperparameter $q$ (see, e.g., Algorithm 4 and Theorem 3 in \citet{xin2021stochastic}) and computes gradients using a full batch every $q$ iterations. For simplicity, we do not include that number of epochs when we plot this figure. In other words, the real number of epochs required to obtain a point on the \texttt{ProxGT-SR-E} curve is larger than plotted in the figures in the second and fourth rows. We include the plots that take $q$ into account in the Appendix.}\label{fig: comparison} %Figure \ref{fig: full_appendix}
\end{figure*}
Following \cite{mancino2022proximal}, we consider solving the classification problem 
\begin{equation}\label{eq:exp_opt}
    \underset{\theta\in\realset^d}{\min}~ \frac{1}{n} \sum_{i=1}^{n} \frac{1}{|\mathcal{D}_i|}\sum_{(x, y)\in \mathcal{D}_i}\ell_i(f(x;\theta),y) + \lambda \|\theta\|_1,
\end{equation}
on a9a and MNIST datasets\footnote{Available at https://www.openml.org.}. Here, $\ell_i$ denotes the cross-entropy loss, and $f$ represents a neural network parameterized by $\theta$ with $x$ being its input. $\mathcal{D}_i$ is the training set only available to agent $i$. The $L_1$ regularization term is used to impose a sparsity structure on the neural network. We use the code in \cite{mancino2022proximal} for \texttt{SPPDM}, \texttt{ProxGT-SR-O/E}, \texttt{DEEPSTORM}, and then implement \texttt{Prox-DASA} and \texttt{Prox-DASA-GT} under their framework, which mainly utilizes PyTorch \citep{NEURIPS2019_9015} and mpi4py \citep{dalcin2021mpi4py}.  We use a 2-layer perceptron model on a9a and the LeNet architecture \citep{lecun2015lenet} for the MNIST dataset. We have 8 agents ($n=8$) which connect in the form of a ring for a9a and a random graph for MNIST. To demonstrate the performance of our algorithms in the constant batch size setting, the batch size is chosen to be 4 for a9a and 32 for MNIST for all algorithms. The learning rates provided in the code of \cite{mancino2022proximal} are adjusted accordingly, and we select the ones with the best performance. For \texttt{Prox-DASA} and \texttt{Prox-DASA-GT} we choose a diminishing stepsize sequence, namely, $\alpha_k =  \min\left\{\alpha\sqrt{\frac{n}{k}}, 1\right\}$ for all $k\geq 0$. Note that the same complexity (up to logarithmic factors) bounds can be obtained by directly plugging in the aforementioned expressions for $\alpha_k$ in Section \ref{sec: proof_sketch}. Then we tune $\gamma\in \left\{1, 3, 10\right\}$ and $\alpha\in \left\{0.3, 1.0, 3.0\right\}$. The penalty parameter $\lambda$ is chosen to be $0.0001$ for all experiments. The number of communication rounds per iteration $m$ is set to be $1$ for all algorithms. 
We evaluate the model performance periodically during training and then plot the results in Figure \ref{fig: comparison}, from which we observe that both \texttt{Prox-DASA} and \texttt{Prox-DASA-GT} have considerably good performance with small variance in terms of test accuracy, training loss, and stationarity. In particular, it should be noted that although \texttt{DEEPSTORM} achieves better stationarity in Figures \ref{mnist_stat_epo} and \ref{mnist_stat_time}, training a neural network with \texttt{DEEPSTORM} takes longer than with \texttt{Prox-DASA} and \texttt{Prox-DASA-GT} since it uses the momentum-based variance reduction technique, which
requires \textbf{two forward-backward passes} (see, e.g., Eq. (10) and Algorithm 1 in \cite{mancino2022proximal}) to compute the gradients in one iteration per agent. In contrast, ours requires only \textbf{one}, which saves a large amount of time (see Table 1 in the Appendix). We include further details of our experiments in the Appendix. %\ref{sec: exp_details}.

\section{Conclusion}

In this work, we propose and analyze a class of single time-scale decentralized proximal algorithms (\texttt{Prox-DASA(-GT)}) for non-convex stochastic composite optimization in the form of \eqref{eq:problem}. We show that our algorithms achieve linear speed-up with respect to the number of agents using an $\cO(1)$ batch size per iteration under mild assumptions. Furthermore, we demonstrate the efficiency and effectiveness of our algorithms through extensive experiments, in which our algorithms achieve relatively better results with less training time using a small batch size compared to existing methods. In future research, it would be intriguing to expand our work in the context of dependent and heavy-tailed stochastic gradient scenarios \citep{wai2020convergence, li2022high}.

\begin{contributions} % will be removed in pdf for initial submission 
					  % (without ‘accepted’ option in \documentclass)
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    TX and XC contributed equally to the paper. 
    TX was responsible for conceptualizing the idea and writing the paper. TX and XC worked together to complete the proof.
    XC took charge of creating the code and conducting the experiments. The paper was further revised by KB and SG.
\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    We thank the authors of~\citep{mancino2022proximal} for kindly providing the code framework to support our experiments.
    The research of KB is supported by NSF grant DMS-2053918. The research of SG is partially supported by NSERC grant RGPIN-2021-02644.

\end{acknowledgements}

% References
\bibliography{xiao_524}

% \appendix
% \onecolumn
% \input{Appendix.tex}

\end{document}
