%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.


%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{enumitem}
\usepackage{multirow}

\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{algorithm}
\usepackage{algorithmic}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% for convenience
\newcommand{\vct}{\boldsymbol }
%\newcommand{\mat}{\mathbf}
\newcommand{\rnd}{\mathsf}
\newcommand{\ud}{\mathrm d}
\newcommand{\nml}{\mathcal{N}}
\newcommand{\loss}{\mathcal{L}}
\newcommand{\hinge}{\mathcal{R}}
\newcommand{\kl}{\mathrm{KL}}
\newcommand{\cov}{\mathrm{cov}}
\newcommand{\dir}{\mathrm{Dir}}
\newcommand{\mult}{\mathrm{Mult}}
\newcommand{\err}{\mathrm{err}}
\newcommand{\sgn}{\mathrm{sgn}}
%\renewcommand{\span}{\mathrm{span}}
\newcommand{\argmin}{\mathrm{argmin}}
\newcommand{\argmax}{\mathrm{argmax}}
\newcommand{\poly}{\mathrm{poly}}
\newcommand{\rank}{\mathrm{rank}}
\newcommand{\conv}{\mathrm{conv}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\diag}{\mat{diag}}
\newcommand{\acc}{\mathrm{acc}}

\newcommand{\aff}{\mathrm{aff}}
\newcommand{\range}{\mathrm{Range}}
\newcommand{\Sgn}{\mathrm{sign}}

\newcommand{\hit}{\mathrm{hit}}
\newcommand{\cross}{\mathrm{cross}}
\newcommand{\Left}{\mathrm{left}}
\newcommand{\Right}{\mathrm{right}}
\newcommand{\Mid}{\mathrm{mid}}
\newcommand{\bern}{\mathrm{Bernoulli}}
\newcommand{\ols}{\mathrm{ols}}
\newcommand{\tr}{\mathrm{tr}}
\newcommand{\opt}{\mathrm{opt}}
\newcommand{\ridge}{\mathrm{ridge}}
\newcommand{\unif}{\mathrm{unif}}
\newcommand{\Image}{\mathrm{im}}
\newcommand{\Kernel}{\mathrm{ker}}
\newcommand{\supp}{\mathrm{supp}}
\newcommand{\pred}{\mathrm{pred}}
\newcommand{\distequal}{\stackrel{\mathbf{P}}{=}}
%\newcommand{\gege}{\textcircled{1}}
\newcommand{\gege}{{A(\vect{w},\vect{w}_*)}}
\newcommand{\gele}{{A(\vect{w},-\vect{w}_*)}}
\newcommand{\lele}{{A(-\vect{w},-\vect{w}_*)}}
\newcommand{\lege}{{A(-\vect{w},\vect{w}_*)}}
\newcommand{\firstlayer}{\mathbf{W}}
\newcommand{\firstlayerWN}{v}
\newcommand{\secondlayer}{a}
\newcommand{\inputvar}{\vect{x}}
\newcommand{\anglemat}{\mathbf{\Phi}}
\newcommand{\holder}{H\"{o}lder }

\def\R{\mathbb{R}}
\def\Z{\mathbb{Z}}
\def\cA{\mathcal{A}}
\def\cB{\mathcal{B}}
\def\cD{\mathcal{D}}
\def\cE{\mathcal{E}}
\def\cF{\mathcal{F}}
\def\cG{\mathcal{G}}
\def\cH{\mathcal{H}}
\def\cI{\mathcal{I}}
\def\cL{\mathcal{L}}
\def\cM{\mathcal{M}}
\def\cN{\mathcal{N}}
\def\cP{\mathcal{P}}
\def\cS{\mathcal{S}}
\def\cT{\mathcal{T}}
\def\cW{\mathcal{W}}
\def\cZ{\mathcal{Z}}
\def\bP{\mathbf{P}}
\def\TV{\mathrm{TV}}
\def\MSE{\mathrm{MSE}}

\def\vw{\mathbf{w}}
\def\va{\mathbf{a}}
\def\vZ{\mathbf{Z}}

\newcommand{\mat}[1]{\mathbf{#1}}
\newcommand{\vect}[1]{\mathbf{#1}}
\newcommand{\norm}[1]{\left\|#1\right\|}
\newcommand{\inner}[1]{\left\langle#1\right\rangle}
\newcommand{\abs}[1]{\left|#1\right|}
\newcommand{\expect}{\mathbb{E}}
\newcommand{\prob}{\mathbb{P}}
\newcommand{\prox}[2]{\textbf{Prox}_{#1}\left\{#2\right\}}
\newcommand{\event}[1]{\mathscr{#1}}
\newcommand{\set}[1]{#1}
\newcommand{\diff}{\text{d}}
\newcommand{\difference}{\triangle}
\newcommand{\inputdist}{\mathcal{Z}}
\newcommand{\indict}{\mathbb{I}}
\newcommand{\rotmat}{\mathbf{R}}
\newcommand{\normalize}[1]{\overline{#1}}
\newcommand{\vectorize}[1]{\text{vec}\left(#1\right)}
\newcommand{\vclass}{\mathcal{G}}
\newcommand{\pclass}{\Pi}
\newcommand{\qclass}{\mathcal{Q}}
\newcommand{\rclass}{\mathcal{R}}
\newcommand{\classComplexity}[2]{N_{class}(#1,#2)}
\newcommand{\cclass}{\mathcal{F}}
\newcommand{\gclass}{\mathcal{G}}
\newcommand{\pthres}{p_{thres}}
\newcommand{\ethres}{\epsilon_{thres}}
\newcommand{\eclass}{\epsilon_{class}}
\newcommand{\states}{\mathcal{S}}
\newcommand{\lowprobstate}{\psi}
\newcommand{\actions}{\mathcal{A}}
\newcommand{\contexts}{\mathcal{X}}
\newcommand{\edges}{\mathcal{E}}
\newcommand{\variance}{\text{Var}}
\newcommand{\params}{\vect{\theta}}
\newcommand{\sign}{\text{sign}}

\newcommand{\relu}[1]{\sigma\left(#1\right)}
\newcommand{\reluder}[1]{\sigma'\left(#1\right)}
\newcommand{\act}[1]{\sigma\left(#1\right)}
\newcommand{\kijmin}{\lambda}
\newcommand{\lambdamin}{\lambda_{\min}\left(\mat{K}^{(H)}\right)}

\newtheorem{thm}{Theorem}[section]
\newtheorem{lem}{Lemma}[section]
% \newtheorem{proof}{Proof}[section]
\newtheorem{cor}{Corollary}[section]
\newtheorem{prop}{Proposition}[section]
\newtheorem{asmp}{Assumption}[section]
\newtheorem{defn}{Definition}[section]
\newtheorem{fact}{Fact}[section]
\newtheorem{conj}{Conjecture}[section]
\newtheorem{rem}{Remark}[section]
\newtheorem{example}{Example}[section]
\newtheorem{condition}{Condition}[section]

%Xiyu framework's notations
\newcommand{\gaussian}{\mathcal{P}}
\newcommand{\linfunc}{\mathcal{L}}
\newcommand{\linsub}{\mathcal{W}}
\newcommand{\detmap}{\mathcal{D}}
\newcommand{\activate}{\rho}
\newcommand{\bias}{b}
\newcommand{\error}{\mathcal{E}}
\newcommand{\wbound}{\mathfrak{W}}
\newcommand{\rhobound}{\Lambda}
\newcommand{\gaussianspace}{{\mathcal{L}^2}}
\title{ResIST: Layer-Wise Decomposition of ResNets for Distributed Training}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Chen Dun}
\author[1]{Cameron R. Wolfe}
\author[1]{Christopher M. Jermaine}
\author[1]{Anastasios Kyrillidis}

% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Rice University\\
    Houston, Texas, USA
}

  
\begin{document}
\maketitle

\begin{abstract}
We propose {\rm \texttt{ResIST}}, a novel distributed training protocol for Residual Networks (ResNets).
{\rm \texttt{ResIST}} randomly decomposes a global ResNet into several shallow sub-ResNets that are trained independently in a distributed manner for several local iterations, before having their updates synchronized and aggregated into the global model.
In the next round, new sub-ResNets are randomly generated and the process repeats until convergence.
By construction, per iteration, {\rm \texttt{ResIST}} communicates only a small portion of network parameters to each machine and never uses the full model during training.
Thus, {\rm \texttt{ResIST}} reduces the per-iteration communication, memory, and time requirements of ResNet training to only a fraction of the requirements of full-model training. 
In comparison to common protocols, like data-parallel training and data-parallel training with local SGD, {\rm \texttt{ResIST}} yields a decrease in communication and compute requirements, while being competitive with respect to model performance.
\end{abstract}


\begin{figure*}[ht]
%   \includegraphics[width=\textwidth]{REXplots/summarysgdm.png}
  \centering
\includegraphics[width=\textwidth]{images/Resnet_modified.png}
\caption{The \texttt{ResIST} model: \textbf{Row $\mathbf{(a)}$} represents the original global ResNet. 
\textbf{Row $\mathbf{(b)}$} shows the creation of two sub-ResNets. Observe that subnetwork 1 contains the residual blocks \#1, \#2 and \#4, while subnetwork 2 contains the residual blocks \#3, \#4 and \#5. 
\textbf{Row $\mathbf{(c)}$} shows the reassembly of the global ResNet, after locally training subnetworks 1 and 2 for some number of local SGD iterations; residual blocks that are common across subnetworks (e.g., residual block \#4, marked with a $\star$) are aggregated appropriately during the reassembly.}
\label{resist}
\end{figure*}


% this must go after the closing bracket ] following \twocolumn[ ...

% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \mlsysEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.

%\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
% \printAffiliationsAndNotice{\mlsysEqualContribution} % otherwise use the standard text.

\section{Introduction}
\textbf{Background.}
The field of Computer Vision (CV) has seen a revolution, beginning with the introduction of AlexNet during the ILSVRC2012 competition. % \citep{alexnet, imagenet}. 
Following this initial application of deep convolutional neural networks (CNNs), 
%more modern architectures were produced, thus rapidly pushing the state of the art in image recognition \citep{zfnet, googlenet, vgg}. 
%In particular, 
the introduction of the residual connection (ResNets) allowed scaling to massive depths without being crippled by issues of unstable gradients during training \citep{resnet}. 
%Such ability to train large networks was only furthered by the development of architectural advancements, like batch normalization \citep{batchnorm}. 
The capabilities of ResNets have been further expanded in recent years, but the basic ResNet architecture has remained widely-used. % \citep{resnext, preactres}.
While ResNets have become a standard building block for the advancement of CV research, % \citep{fasterrcnn, densenets, maskrcnn, retinanet}, 
the computational requirements for training them are significant.
% As compute power has become more advanced, state-of-the-art architectures in computer vision (CV) have become deeper and wider \citep{wideresnet, resnet, stochdepth}.
% Additionally, large-scale annotated vision datasets \citep{imagenet, coco} have resulted in significantly increased training times.
%However, as ResNets enable training on neural networks with massive depth and on much larger scale datasets, the computational requirements for training are significantly increased.
For example, training a ResNet50 on ImageNet with a single NVIDIA M40 GPU takes 14 days \citep{you2018imagenet}.
%The computational expense of training modern computer vision models, such as convolutional neural networks (CNNs), creates a need for efficient methods of distributed training.
%For example, the recent Vision Transformer (ViT) model \citep{16by16} requires 2.5K TPU-v3 core days for pre-training on the JFT-300M dataset, while the Big Transfer (BiT) methodology \citep{BigT} requires over 8 GPU-months of pre-training on similar-sized datasets to achieve reported results.
%Furthermore, training a standard ResNet \citep{resnet} on ImageNet can require several days of GPU time \citep{imagenetinmin, benchmarkgpu}.
%\textcolor{blue}{Tasos: Similar to the Amazon grant, can we "speak" with numbers? e.g., datasets + architectures = that many GPU hours with citations, etc etc. We can include examples that we cannot run yet, but we can conclude with the standard example of ImageNet that requires in the ResNet101 that many GPU hours.}

% what are the current approaches to distributed training?
Distributed training with multiple GPUs is commonly adopted to speed up the training process for ResNets. %distributed training of ResNet with multiple GPUs is commonly adopted. For example, data-parallel training method is the widely used approach to accelerate training of ResNet. \citep{1hrimagenet, increasebatchsize, pytorch-datadistrib, tensorflow-datadistrib}.
%\textcolor{blue}{Tasos: include specific references to Tensorflow and PyTorch modules to signify that.} 
Yet, such acceleration is achieved at the cost of a remarkably large number of GPUs (e.g., 256 NVIDIA Tesla
P100 GPUs in \citep{1hrimagenet}).
Additionally, frequent synchronization and high communication costs create bottlenecks that hinder such methods from achieving linear speedups with respect to the number of available GPUs~\citep{distrib-benchmark}.
% High communication costs of such methods hinder them from achieving the ideal, linear speedup \citep{distrib-benchmark, comm-eff-survey}.
%\textcolor{blue}{Tasos: provide examples where more compute sites result into longer training times, with citations if possible.}
Asynchronous approaches avoid the cost of synchronization, but stale updates complicate their optimization process \citep{asynch-summary}.
Other methods, such as data-parallel training with local SGD \citep{localsgdconverge, use_local_sgd, parallel_sgd, fed_avg}, reduce the frequency of synchronization.
Similarly, model-parallel training has gained in popularity by decreasing the cost of local training between synchronization rounds \citep{parallelism_survey, lamp, nonlinear_multigrid_layer_parallel, layer_parallel_resnet, xpipe, multi_gpu_model_parallel}.

% FIXED \textcolor{magenta}{Tasos: would someone complain at this point about model-parallel approaches?}

% What are current methods to reduce communication in distributed training - how is our method different?
% (DONE - chen has these tests) \textcolor{blue}{Tasos: we keep saying that but nobody has tried yet to apply the most established techniques on this. We should do it at some point. }


% what is our proposal to solve these issues?
\textbf{This paper.}
We focus on efficient distributed training of CNNs with residual skip connections.
Our proposed methodology accelerates synchronous, distributed training by leveraging ResNet robustness to layer removal \citep{stochdepth}.
In particular, a group of high-performing subnetworks (sub-ResNets) is created by partitioning the layers of a shared ResNet model to create multiple, shallower sub-ResNets.
These sub-ResNets are then trained independently (in parallel) for several iterations before aggregating their updates into the global model and beginning the next iteration.
%a methodology similar to \citep{IST} can be adopted: %\textcolor{magenta}{Do not assume that people know our work; spend 2-3 sentences to describe the idea without relying on IST; say e.g., we propose the following methodology, blah blah. Then in related works mention that the idea resembles to IST, but highlight the differences.} to train each subnetwork in an independent, distributed manner. 
%Once each shallow subnetwork has been trained for a certain number of iterations, they can be aggregated into a full, global model.
Through the local, independent training of shallow sub-ResNets, this methodology both limits synchronization and communicates fewer parameters per synchronization cycle, thus drastically reducing communication overhead.
% Such methodology limits synchronization by allowing independent training to proceed for many iterations between aggregations.
% Additionally, because only a portion of the network's parameters are partitioned to each sub-ResNet, fewer parameters are communicated per synchronization cycle, thus reducing communication overhead.
% \textit{By rethinking the framework for synchronous distributed training, our proposal addresses the issues that damage its performance most: synchronization and communication}.
We name this scheme \textit{ResNet Independent Subnetwork Training} (\texttt{ResIST}).
The contributions of this work are: 
    % give the contributions of this paper
%Some highlights of this work are: 
\begin{itemize}[leftmargin=*]
    \item We propose a distributed training scheme for ResNets, dubbed \texttt{ResIST}, that partitions the layers of a global model to multiple, shallow sub-ResNets, which are then trained independently between synchronization rounds. 
    \item We provide theory showing that \texttt{ResIST} (based on simple ResNet architectures) converges linearly, up to an error neighborhood, using distributed gradient descent with local iterations. We show that the behavior of \texttt{ResIST} is controlled by the overparameterization parameter $m$, the number of workers $S$ in the distributed setting, the number of local iterations, and the depth $H$ of the ResNet architecture. Such findings reflect practical observations made in the experimental section. 
    \item We perform extensive ablation experiments to motivate the design choices for \texttt{ResIST}, indicating that optimal performance is achieved by $i)$ using pre-activation ResNets, $ii)$ scaling intermediate activations of the global network at inference time, $iii)$ sharing layers between sub-ResNets that are sensitive to pruning, and $iv)$ imposing a minimum depth on sub-ResNets during training.
    \item \texttt{ResIST} is shown to achieve high accuracy and time efficiency in all cases. We conduct experiments on several image classification and object detection datasets, including CIFAR10/100, ImageNet, and PascalVOC.  
    \item We utilize \texttt{ResIST} to train numerous different ResNet architectures (e.g., ResNet101, ResNet152, and ResNet200) and provide implementations for each in PyTorch \citep{pytorch}.  %\vspace{-0.3cm} 
\end{itemize}

% We conduct experiments on various image classification datasets. 
% Object detection experiments are also provided with \texttt{ResIST} on the PascalVOC dataset.
% The experiments on these datasets involve several ResNet architectures, including ResNet101, ResNet152, and ResNet200.
%\textcolor{blue}{Tasos: we should describe the models + datasets we use: e.g., take this from Demon paper. Experiments are provided on various datasets---including MNIST, FMNIST, CIFAR-10, CIFAR-100, STL-10, Tiny ImageNet, Penn Treebank (PTB); and networks---including Convolutional Networks (CNN) with Residual architecture (ResNet) (Wide ResNet), Non-Residual architecture (VGG-16) , Recurrent Neural Networks (RNN) with Long Short-Term Memory architecture (LSTM) , Variational AutoEncoders (VAE) , Capsule Network , Noise Conditional Score Network (NCSN).}
% give the layout of the paper

\section{Sub-ResNet Training} \label{methods}
\texttt{ResIST} operates by partitioning the layers of a global ResNet to different, shallower sub-ResNets, training those independently, and intermittently aggregating their updates into the global model.
The high-level process followed by \texttt{ResIST} is depicted in Fig. \ref{resist} and outlined in more detail by Algorithm \ref{alg:resist}.
\emph{We note that a naive, uniform partitioning of blocks to each subnetwork, resembling a distributed implementation of \citep{stochdepth}, performs poorly (see Figure 1 in the Appendix).}
To improve upon this procedure, extensive design choices, outlined in section A in the Appendix, are studied to motivate \texttt{ResIST}, leading to a final methodology that generalizes well across domains and datasets.


\subsection{Model Architecture} \label{model_arch}


\begin{figure}
\centering
\includegraphics[width=1\columnwidth]{images/resnet_model.png}

\caption{The ResNet101 model used in the majority of experiments. The figure identifies the convolutional blocks that are partitioned to subnetworks. The plot depicts the pre-activation ResNet setting, where we use BN, ReLU, and Conv layers twice in sequence. The network is comprised of four major ``sections'', each containing a certain number of convolutional blocks of equal channel dimension.}
\label{model_depict}

\end{figure}

To achieve optimal performance with \texttt{ResIST}, the global model must be sufficiently deep.
Otherwise, sub-ResNets may become too shallow after partitioning, leading to poor performance.
For most experiments, a ResNet101 architecture is selected, which balances sufficient depth with reasonable computational complexity. 
Experiments with deeper architectures are provided in section A.4 in the Appendix.
%\textcolor{blue}{Tasos: We will try to include ResNet152 and ResNet200 in future experiments}.
% WE NEED TO PROVIDE EMPIRICAL EVIDENCE OF THIS BEING A GOOD CHOICE! % TODO: show the minimum number of blocks required, but also run some experiments with ResNet 20 ResNet50 etc.
%A more comprehensive, empirical analysis of subnetwork depth is available in Sec. XXX, showing that the ResNet101 is a reasonable choice for the provided experiments.

\texttt{ResIST} performs best with pre-activation ResNets \citep{preactres}.
Intuitively, applying batch normalization prior to the convolution ensures that the input distribution of remaining residual blocks will remain fixed, even when certain layers are removed from the architecture.
The Pre-activation ResNet101, which we utilize for the majority of experiments, is depicted in Fig. \ref{model_depict}.
This model, as well as deeper variants (e.g., ResNet152 and ResNet200), are readily available through deep learning packages like PyTorch \citep{pytorch} and Tensorflow \citep{tensorflow}.


\subsection{Sub-ResNet Construction} \label{subnet_sec}

% From \citep{stochdepth}, it is already known that layers can be removed from residual networks, without damaging their performance.
% However, a disjoint partitioning of layers to sub-ResNets was found to perform poorly (see Fig. \ref{resist_ablations}), showing that a naive extension of \citep{stochdepth} into the distributed setting is not sufficient for our methodology.

Pruning literature has shown that strided-, initial-, and final-layers within CNNs are sensitive to pruning \citep{filterprune}.
Additionally, repeated blocks of identical convolutions (i.e., equal channel size and spatial resolution) are less sensitive to pruning \citep{filterprune}.
Drawing upon these results, \texttt{ResIST} only partitions blocks within the third section of the ResNet (see the highlighted section in Fig. \ref{model_depict}), while all other blocks are shared between sub-ResNets.
These blocks are chosen for partitioning because $i)$ they account for the majority of layers; $ii)$ they are not strided; $iii)$ they are located within the middle of the network (i.e., initial/final layers are excluded); and $iv)$ they reside within a long chain of identical convolutions.
By partitioning these blocks, \texttt{ResIST} allows sub-ResNets to be shallower than the global model, while maintaining high performance. % within each sub-ResNet.


The process of constructing sub-ResNets follows a simple procedure; see Figure \ref{resist}.
From row $(a)$ to $(b)$ within Figure \ref{resist}, indices of partitioned layers within the global model are randomly permuted and distributed to sub-ResNets in a round-robin fashion.
Each sub-ResNet receives an equal number of convolutional blocks (e.g., see row $(b)$).
In some cases, residual blocks may be simultaneously partitioned to multiple sub-ResNets to ensure sufficient depth (e.g., see $(\star)$ in Figure \ref{resist}).
% Once sub-ResNets have been independently trained, the parameters of each block are copied back into the correct location within the global model, as depicted by the transition from $(b)$ to $(c)$ in Fig. \ref{resist}.
% If a single block was assigned to multiple sub-ResNets (e.g., $(\star)$ in Fig. \ref{resist}), the parameters of this block must be averaged within the global model. 
\texttt{ResIST} produces subnetworks with $\mathcal{O}(\frac{1}{S})$ of the global model depth, where $S$ is the number of independently-trained sub-ResNets.\footnote{A fixed number of blocks is excluded from partitioning (i.e., blocks not in the third section). As a result, this approximation of $\mathcal{O}(\frac{1}{S})$ becomes more accurate as the network becomes deeper (i.e., deeper ResNet variants only add blocks to the third section), as a larger ratio of total blocks is included in the partitioning process.}
To contrast this with existing non-distributed attempts, stochastic depth networks \citep{stochdepth} have an expected depth of 75\% of the global model.

The shallow sub-ResNets created by \texttt{ResIST} accelerate training and reduce communication in comparison to methods that communicate and train the full model.
Table \ref{comm_amounts} shows the comparison of local SGD to \texttt{ResIST} with respect to the amount of data communicated during each synchronization round for different numbers of machines, highlighting the superior communication-efficiency of \texttt{ResIST}.
% DONE \textcolor{blue}{Tasos: we should refer to a table that we will include in the next sections, where we show the amount of information shared per synchronization in our method and in the data parallel case.}

%\vspace{-0.4cm}
\begin{table}[!t]
\centering
\caption{Data communicated during each communication round (in GB) of both local SGD \citep{localsgdconverge} and \texttt{ResIST} across different numbers of machines with ResNet101.}

%\begin{Large}
\begin{tabular}{cccc}
\toprule
    Method & 2 Machines & 4 Machines & 8 Machines\\ \midrule
    Local SGD & 0.662 GB & 1.325 GB & 2.649 GB\\
    \texttt{ResIST} & \textbf{0.454} GB & \textbf{0.720} GB & \textbf{1.289} GB \\
 \bottomrule
\end{tabular}
\label{comm_amounts}
%\vspace{-0.3cm}
%\end{Large}
\end{table}


%Although such partitioning can be generalized to any number of sites, one must consider that subnetworks may become too shallow as the total number of subnetworks becomes large.
%To prevent such an issue, \texttt{ResIST} enforces a minimum subnetwork depth requirement, which can be satisfied by partitioning certain blocks to multiple subnetworks simultaneously.


%These experiments provide useful insight for choosing ResNet architectures to train with \texttt{ResIST}.
%For example, the ResNet50 only contains five residual blocks that are available for partitioning.
%Although minimum depth requirements can be imposed on each subnetwork to allow the ResNet50 to be trained with \texttt{ResIST}, this minimum depth requirement would result in each subnetwork having a similar depth to the global model.
%As a result, \texttt{ResIST} would, in this case, yield minimal speedups in comparison to data-parallel training schemes.
%The ResNet101 (i.e., the next-largest standard ResNet architecture) contains sufficient residual blocks for partitioning to two, four, and eight subnetworks.
%Additionally, because a model of this depth yields subnetworks with significantly fewer blocks in comparison to the global model, \texttt{ResIST} achieves noticeable acceleration in comparison to vanilla data-parallel training.

\subsection{Distributed Training}
The \texttt{ResIST} training procedure is outlined in Algorithm \ref{alg:resist}.
%Here, $T$ respresents the total number of independent training rounds/synchronizations, $S$ represents the number of sub-ResNets, and $\ell$ represents the number of local training iterations for each sub-ResNet.
Sub-ResNet construction (i.e., \texttt{subResNets$(\cdot)$} in Algorithm \ref{alg:resist}) follows the procedure outlined in Sec. \ref{subnet_sec}.
After constructing the sub-ResNets, they are trained independently in a distributed manner for $\ell$ iterations. % \citep{use_local_sgd}.
% (DONE) \textcolor{blue}{Tasos: we should cite local SGD here.}
% (DONE) \textcolor{blue}{Tasos: we should have a paragraph where we provide an overview of the implementation (e.g., do we run a parameter server or a distributed parameter server implementation? Any details like this should be included somewhere; if there is potential extensions, we should name them: e.g., if we implement a regular parameter server, but there is no reason to believe that we cannot implement a distributed parameter server, we should claim that.}
Following independent training, the updates from each sub-ResNet are aggregated into the global model.
Aggregation (i.e., $\text{\texttt{aggregate}}(\cdot)$ in Algorithm \ref{alg:resist}) sets each global network parameter to its average value across the sub-ResNets to which it was partitioned.
If a parameter is only partitioned to a single sub-ResNet, aggregation simplifies to copying the parameter into the global model.
After aggregation, the global model is re-partitioned randomly to create a new group of sub-ResNets, and this entire process is repeated.

\begin{algorithm}[!htp]
\centering
\caption{\textsc{ResIST} Meta Algorithm}\label{alg:resist}
\begin{algorithmic}
    \STATE \textbf{Parameters}: $T$ synchronization iterations, $S$ sub-ResNets, $\ell$ local iterations, $\mathcal{W}$ ResNet weights. %\vspace{-0.2cm}
    %\\\hrulefill
    \STATE $h(\mathcal{W})$ $\leftarrow$ randomly initialized ResNet.
    \FOR{$t = 0, \dots, T-1$}
    \STATE$\left\{h_s(W_s)\right\}_{s = 1}^S = \text{\texttt{subResNets}}(h(\mathcal{W}), ~S)$.
    \STATE Distribute each $h_s(W_s)$ to a different worker.
    \FOR{$s = 1, \dots, S$}
    %\State $subnet$ = $subnets$[$s$]
    \STATE //~Train $h_s(W_s)$ for $\ell$ iterations using local SGD.
    \FOR{$l_t = 1, \dots, \ell$}
    \STATE $W_s=W_s-\eta \frac{\partial L}{\partial W_s}$
    \ENDFOR
    \ENDFOR
    \STATE $h(\mathcal{W}) = \texttt{aggregate}\left(\left\{h_s(W_s)\right\}_{s = 1}^S\right)$.
\ENDFOR        
\end{algorithmic}
\end{algorithm}

% \begin{algorithm}[!htp]
% \centering
% \caption{IST Meta Algorithm}\label{alg:resist}
% \begin{algorithmic}
%     \STATE \textbf{Parameters}: $T$ synchronization iterations, $S$ subnets, $\ell$ local iterations, $x$ weights. %\vspace{-0.2cm}
%     %\\\hrulefill
%     \STATE $f(x)$ $\leftarrow$ randomly initialized network.
%     \FOR{$t = 0, \dots, T-1$}
%     \STATE$\left\{f_s(x_s)\right\}_{s = 1}^S = \text{\texttt{subnets}}(f(x), ~S)$.
%     \STATE Distribute each $f_s(x_s)$ to a different worker.
%     \FOR{$s = 1, \dots, S$}
%     %\State $subnet$ = $subnets$[$s$]
%     \STATE Train $f_s(x_s)$ for $\ell$ iterations using local SGD.
%     \ENDFOR
%     \STATE $f(x) = \texttt{aggregate}\left(\left\{f_s(x_s)\right\}_{s = 1}^S\right)$.
% \ENDFOR        
% \end{algorithmic}
% \end{algorithm}

% \textcolor{magenta}{Tasos: Have a look at this paragraph.}
% The current implementation relies on ideas of parallel (local) SGD, where a globally averaged update is computed using the Parameter Server paradigm \citep{li2014scaling} or the All-reduce communication primitive \citep{patarasuk2009bandwidth}.
% By default, these protocols suffer from significant bandwidth costs or high latency, that harms the training scalability.
% Decentralized training protocols \citep{assran2019stochastic, chen2012diffusion, lian2017can, lian2018asynchronous, nedic2009distributed} based on partial averaging can reduce the communication overhead.
% We note that the \texttt{ResIST} protocol can be combined with decentralized SGD, with a local model ResNet decomposition among the neighbors of each compute node; we consider this direction as future work.

\begin{figure}

\centering
\includegraphics[width=0.95\columnwidth]{images/decentralized3.png}

\caption{A depiction of the decentralized repartition procedure. This example partitions a ResNet with eight blocks into four different sub-ResNets. The ``blue-green-red'' squares dictate the data that lies per worker; the orange column dictates the last classification layer. As seen in the figure, each worker is responsible for only a fraction of parameters of the whole network. The whole ResNet is never fully stored, communicated or updated on a single worker.}
\label{decentralized}
%\vspace{-0.2cm}
\end{figure}

\begin{figure}[h]

    \centering
    \includegraphics[width=1\linewidth]{images/communication_budget_model_parallel.png}

    \caption{Communication efficiency of \texttt{ResIST} versus data parallelism (vanilla), model parallelism (GPipe - \citep{huang2019gpipe}) and local SGD (LSGD) on CIFAR100. }
    \label{fig:baseline}

\end{figure}

\subsection{Baseline Choice}
Common baselines for distributed training are generally split into data- and model-parallelism protocols. 
Focusing on the former, the communication efficiency of \texttt{ResIST} significantly surpasses that of data-parallelism.
In particular, data parallel methods need to synchronize the whole model at every training iteration, while \texttt{ResIST} only needs to communicate the weights of sub-ResNets among the workers. 

% Additionally, \texttt{ResIST} is more communication efficient compared to common model parallel methods such as GPipe \citep{huang2019gpipe}. 
Typically, model parallel techniques split the model into modules (such as layers) and distribute these modules to each worker.
%E.g., GPipe splits the model into modules (such as layers) and distributes them to each worker.
At every training iteration, input data is first passed to the worker containing the network's beginning module (e.g., the first layer).
%At every training iteration, the input data is first passed to the worker containing the first module of the model (e.g., the first layer), before it gets propagated through the full model. 
Then, at each module, the worker $i)$ performs a forward pass of its module and $ii)$ sends the resulting output activation to the worker containing the next module.
% data is propagated sequentially through workers containing each of the network's modules.
% Each worker needs to perform a forward pass of its module and pass the full activation map of the last layer. 
After the last module is activated, the final loss and gradient are calculated before the backward pass is performed, where each worker receives gradient information needed for updating module weights.
% In the backward pass, each worker will receive the gradient information, based on which they will perform gradient updates.

Model-parallelism often suffers from higher communication frequency and volume, in comparison to data parallel methods, due to the significant cost of transmitting network activation maps between workers. %during the forward and backward pass. 
E.g., model parallel training of ResNets requires transmission of the full batch activation map between layers, which is more cumbersome than simply communicating network parameters.
\texttt{ResIST} is more communication efficient compared to common model parallel methods (e.g., GPipe \citep{huang2019gpipe}).
% Note that such model parallel methods could suffer from higher communication frequency and larger communication volumes at each synchronization. \textcolor{red}{why?}
% For ResNets specifically, we need to transmit the full batch activation map between layers, which has significantly larger size compared to the network's parameters. %Thus, we also consider model parall as an inappropriate baseline.

Within this work, we adopt local SGD \citep{use_local_sgd}---a strong variant of data parallel training---as our baseline.
Similar to \texttt{ResIST}, local SGD performs local training iterations on each worker between synchronizations, thus largely decreasing communication frequency and volume. 
To justify this selection, we perform a baseline comparison, which is displayed in Figure \ref{fig:baseline} and further detailed in Sections \ref{S:Implementation} and \ref{S:exp_det}. 
As shown in Figure \ref{fig:baseline}, \texttt{ResIST} is significantly more communication efficient in comparison to data-parallelism (vanilla) and model-parallelism (GPipe), thus making local SGD a more appropriate baseline.
%  as shown in Figure \ref{fig:baseline}. 
% The details of this experiment are in Sections \ref{S:Implementation} and \ref{S:exp_det}. 

% We consider the local SGD as our baseline \citep{use_local_sgd}, as it is a competitive baseline compared to \texttt{ResIST}. 
% Local SGD is a strong variant of data parallelism: similar to \texttt{ResIST}, it also adopts local training iteration on each worker before every synchronization, which largely decrease the communication frequency and total communication volume. 

% To justify this selection, we perform a baseline comparison experiment, as shown in Figure \ref{fig:baseline}. 
% The details of this experiment are in Sections \ref{S:Implementation} and \ref{S:exp_det}. 
% As shown, \texttt{ResIST} is significantly more communication efficient compared to data parallelism (vanilla) and model parallelism (GPipe). % such that they are not appropriate and competitive baseline.

\subsection{Implementation Details}
\label{S:Implementation}

\texttt{ResIST} is implemented in PyTorch \citep{pytorch}, using the NCCL communication package. 
We use basic \texttt{broadcast} and \texttt{reduce} operations for communicating blocks in the third section and \texttt{all reduce} for blocks in other sections. %between different subgroup of sub-ResNets containing the same block. The first, second and forth sections are updated using \texttt{all reduce}. 
We adopt the same communication procedure for the local SGD baseline to ensure fair comparison.  
\emph{The implementation of \texttt{ResIST} is decentralized, meaning that it does not assume a single, central parameter server.}
%\textcolor{magenta}{Tasos: I would like to discuss this part.}


As shown in Figure \ref{decentralized}, during the synchronization and repartition step following local training, each sub-ResNet will directly send each of its locally-updated blocks to the designated new sub-ResNet. % (i.e., the parameters are not sent to an intermediate parameter server).
Each worker will only need sufficient memory to store a single sub-ResNet, thus limiting the memory requirements.
% At all time steps, each worker will only need the memory size of the single subnetwork.
Such a decentralized implementation allows parallel communication between sub-ResNets, which leads to further speedups by preventing any single machine from causing slow-downs due to communication bottlenecks. % in the distributed procedure.
The implementation is easily scalable to eight or more machines, either on nodes with multiple GPUs or across distributed nodes with dedicated GPUs. % as it is possible for the training to be distributed across multiple GPUs on a single node, as well as across several compute nodes.

%\emph{This work is focused on the algorithmic level of distributed ResNet training.}
\texttt{ResIST} reduces the number of bits communicated at each synchronization round and accelerates local training with the use of shallow sub-ResNets.
The authors are well-aware of many highly-optimized versions of data-parallel and synchronous training methodologies \citep{pytorch, tensorflow, sergeev2018horovod}. 
\texttt{ResIST} is fully compatible with these frameworks and can be further accelerated by leveraging highly-optimized distributed communication protocols at the systems level, which we leave as future work. % as it only uses basic general \emph{broadcast} and \emph{reduce} operations. 
%Therefore, in comparison to similarly-implemented versions of synchronous methodologies, \texttt{ResIST} yields a massive acceleration.
% We leave this as future work.
Further, the authors are well-aware of advanced recent decentralized distributed computing techniques as in \citep{koloskova2020unified, nedic2009distributed, assran2020asynchronous, koloskova2019decentralized}; %\citep{koloskova2020unified, nedic2009distributed, johansson2010randomized, lian2017can, tang2018d, assran2020asynchronous, koloskova2019decentralized}; 
our aim is to show the benefits of our approach even on simpler distributed frameworks, and we leave the extension of \texttt{ResIST} to such more advanced protocols as future work.

\subsection{Supplemental Techniques}
\label{S:supp_tech}
% \begin{table}[!htp]
% \centering
% \caption{Test accuracy on CIFAR10 and CIFAR100 for ResNet-101 trained with \texttt{ResIST} and different numbers of local iterations. Each test was performed with four learning rates at different magnitudes, and the best-performing result is reported. $\infty$ local iterations refers to aggregating parameters only once at the end of training (i.e., single-shot averaging).}
% \begin{tabular}{ccccc}
% \toprule
%     & \multicolumn{2}{c}{CIFAR10} & \multicolumn{2}{c}{CIFAR100} \\
%     \midrule
%     & \multicolumn{2}{c}{\# Sub-ResNets} & \multicolumn{2}{c}{\# Sub-ResNets}\\
%     \# Local Iter. & 2 & 4 & 2 & 4\\
%      \midrule
%      50 & 92.23 & 90.60 & 70.79 & 67.95 \\
%      100 & 92.33 & 90.35 & 71.10 & 67.90 \\
%      250 & 92.21 & 90.50 & 70.78 & 67.35\\
%      500 & 92.33 & 90.26 & 71.88 & 67.24\\
%      1000 & 92.31 & 90.38 & 71.55 & 66.92 \\
%      1500 & 92.16 & 89.95 & 71.71 & 66.62 \\
%      2000  & 92.24 & 89.60 & 71.26 & 65.57\\
%      4000 & 91.77 & 85.97 & 69.08 & 59.57 \\
%      6000 & 90.23 & 81.71 & 67.20 & 53.14 \\
%      $\infty$ & 70.65 & 61.86 & 31.34 & 22.10 \\
%  \bottomrule
% \end{tabular}
% \label{local_iter_resist}
% \end{table}


\textbf{Scaling Activations.}
%The \texttt{ResIST} algorithm returns a full-depth model, but conducts training with shallow ResNets.
Similar to \citep{stochdepth}, activations must be scaled appropriately to account for the full depth of the resulting network at test time.
To handle this, the output of residual blocks in the third section of the network (see Figure \ref{model_depict}) is scaled by $1/S$, where $S$ is the number of sub-ResNets.
Such scaling allows the global model to perform well, despite using all layers at test time. 
%\textcolor{blue}{Tasos: is there a citation that we refer to that backs up this scaling? If not, no problem.}

\noindent
\textbf{Subnetwork Depth.}
Within \texttt{ResIST}, sub-ResNets may become too shallow as the number of sub-ResNets increases. % (i.e., assuming blocks are partitioned disjointly and the global model depth is fixed). 
To solve this issue, \texttt{ResIST} enforces a minimum depth requirement, which is satisfied by sharing certain blocks between multiple sub-ResNets.
Through experimental analysis, a minimum of five blocks partitioned to each sub-ResNet was found to perform optimally.
Such a finding motivates our choice of the ResNet101 architecture, as ResNet50 contains only five blocks for partitioning.
%Therefore, ResNet101 is the minimum-size architecture for \texttt{ResIST} in which sub-ResNets are significantly shallower than the global model, allowing for noticeable acceleration.
\texttt{ResIST} is extensible to deeper architectures; see section A.4 in the Appendix.

\noindent
\textbf{Tuning Local Iterations.}
%The number of local iterations is a hyperparameter that must be tuned in \texttt{ResIST} (i.e., see $\ell$ in Algorithm \ref{alg:resist}).
We use a default value of $\ell=50$, as $\ell<50$ did not noticeably improve performance.
In some cases, the performance of \texttt{ResIST} can be improved by tuning $\ell$ (see Figure 2 in the Appendix).
% E.g., \texttt{ResIST} performance boosts by increasing $\ell$ in certain cases (see Fig. \ref{fig:local_iter}).
The optimal $\ell$ setting in \texttt{ResIST} is further explored in section A.3 in the Appendix.

\noindent
\textbf{Local SGD Warm-up Phase.}
Directly applying \texttt{ResIST} may harm performance on some large-scale datasets (e.g., ImageNet).
To resolve this, we perform a few epochs with data parallel local SGD before training the model with \texttt{ResIST}.\footnote{Activations of blocks within the third section are still scaled during local SGD pre-training to maintain consistency with \texttt{ResIST}.}
By simply pre-training a model for a few epochs with local SGD, the remainder of training is completed using \texttt{ResIST} without a significant performance decrease.
% As observed in expeirments on large scale dataset (eg. Imagenet), directly applying \texttt{ResIST} will harm the final performance. This might be due to additional training instability caused by \texttt{ResIST}. To solve this issue, we propose to use few epochs of data parallel local sgd training of full model as pretraining for the following \texttt{ResIST}. We also scale the activates of full model during pretraining with the same scaling constant with \texttt{ResIST} to maintain consistent. We initialize the weight, learning rate and batch statistic of each subnetwork in \texttt{ResIST} with pretrained network and finish the remaining epochs.
%\textcolor{blue}{Tasos: add citations} 
%\textcolor{blue}{Tasos: we should visit these local SGD papers and see what they propose in their scenarios. We have to be clear that we have looked into various values and we have chosen this one.}

% \noindent
% \textbf{Maintaining Momentum Buffers.}
% We use the SGD optimizer with momentum for all experiments.\footnote{Experiments with different optimizers, such as Adam \citep{adam}, AdamW \citep{adamw}, or Demon \citep{demon}, do not show any improvements.}
% %\textcolor{blue}{Tasos: this subsubsection needs to be better explained. The momentum here I assume is the regular momentum right? This means that we use SGD+M as the optimizer? If yes, it would be great to show with a small plot what is the buffer and what are the issues.}
% %\textcolor{blue}{Tasos: which optimizer do we use and why? Have we checked other optimizers? Would Demon work here?}\
% %In our initial implementation of \texttt{ResIST}, the optimizer for each sub-ResNet, including the entire momentum buffer, was newly initialized after each synchronization round.
% %As a result, the momentum buffers only reflected a limited window of training history, which diminishes the benefits of momentum.
% %To eliminate this issue, a
% A global memory of the momentum buffer for each layer can be stored so that the buffer is maintained between synchronizations.
% This improves performance, but results in increased communication costs (i.e., the momentum buffer must be communicated between machines).
% As a result, we did not include this change within the final implementation of \texttt{ResIST}.


\section{Theoretical Result}

We provide proof that the gradient descent direction of combined updates from all sub-ResNets, during distributed local training, is close to the hypothetical gradient descent direction of the whole model as if trained centrally.     

\begin{thm}[Convergence Rate of Gradient Descent for \texttt{ResIST}]\label{thm:resist_gd}
Assume there are $S$ workers, $\ell$ local and $T$ global steps. Assume the depth of the whole ResNet is $H$. Assume for all data indices $i \in [n]$, the data input satisfies $\norm{\vect{x}_i}_2 = 1$, the data output satisfies $\abs{y_i} = O(1)$, and the number of hidden nodes per layer satisfies $m = $ 
\begin{align}
\Omega\bigg(\max\bigg\{\tfrac{n^4 }{\lambda_{\min}^4\left(\mat{K}^{(H)}\right)H^6},\tfrac{n^2 }{\lambda_{\min}^2(\mat{K}^{(H)})H^2},\tfrac{n}{\delta}, \tfrac{n^2\log\left(\tfrac{Hn}{\delta}\right)}{\lambda_{\min}^2\left(\mat{K}^{(H)}\right)} \bigg\}\bigg).\nonumber
\end{align}
Set the step size $\eta = O\bigg(\tfrac{\lambdamin H^2 }{n^2 \ell^2 S}\bigg)$ in gradient descent in local training iteration, and follow the procedure as in Algorithm 1. 
Let the squared-norm loss be $L(\theta(t)) := \tfrac{1}{2} \|\vect{y} - f(\theta(t))\|_2^2$, per global synchronization round $t$, $t=1,2,\ldots, T$; here, $\vect{y}$ corresponds to the data ``labels'', and $\theta(t)$ and $f(\theta(t))$ represent the parameters and the output of the whole ResNet, respectively, after $t$ global rounds of \texttt{ResIST}.
Here, $\theta$ includes weights $\mat{W}^{(h)}$ at depth $h$ and the last layer's weights $\vect{a}$.
Then, with probability at least $1-\delta$ over the random initialization, we have:
\begin{align*}
L(\params(t))\le \left(1-\tfrac{\eta \ell \lambdamin}{2}\right)^{t} \cdot L(\params(0)).
\end{align*}

\end{thm}

First, some definitions; more details in the Appendix.
Similar to \citep{du2019gradient}, $\mat{K}^{(H)} \in \mathbb{R}^{n \times n}$ 
is a fixed matrix that depends on the input data, neural network architecture and the activation but does not depend on neural network parameters.
Next, we present our method of proving this global result on \texttt{ResIST}. 
Our proof technique is inspired by \citep{du2019gradient}: 
Let the prediction of the network at some $k$-th iteration be $\vect{u}(k)=f(\theta(k))$.\footnote{We use $k$ to abstract the notion of an iteration in \citep{du2019gradient}; in our case, a different analysis includes two different iteration indices, $\ell$ and $t$.}
We formulate the training dynamics as: 
\begin{align*}
\vect{y}-\vect{u}(k+1) = (\vect{I}-\eta \mat{G}(k))(\vect{y}-\vect{u}(k)), 
\end{align*}
where $\mat{G}_{ij}(k) = \inner{\frac{\partial u_i(k)}{\partial \params(k)},\frac{\partial u_j(k)}{\partial \params(k)}} = $ 
\begin{align*}
&\sum_{h=1}^{H}\inner{\tfrac{\partial u_i(k)}{\partial \mat{W}^{(h)}(k)},\tfrac{\partial u_j(k)}{\partial \mat{W}^{(h)}(k)}} + \inner{\tfrac{\partial u_i(k)}{\partial \vect{a}(k)},\tfrac{\partial u_j(k)}{\partial \vect{a}(k)}} \\
\triangleq & \sum_{h=1}^{H+1}\mat{G}^{(h)}_{ij}(k).
\end{align*}
The proof in \citep{du2019gradient} follows these ideas: when the width $m$ of the deep ResNet is sufficiently large, $\mat{G}^{(H)}(k)$ will be very close to $\mat{G}^{(H)}(0)$, and all of the $\mat{G}^{(H)}(k)$'s will be close to the fixed population Gram matrix $\mat{K}^{(H)}$. The exact definition of $\mat{K}^{(H)}$ for ResNet can be found in Section 6 of \cite{du2019gradient}. Further, $\lambda_{\min} (\mat{G}^{(H)}(0))$ is larger than 0. Thus, by standard matrix perturbation analysis, it is shown that $\lambda_{\min}(\mat{G}^{(H)}(k))$ is also strictly positive, which results in linear convergence of the deep ResNet.

Here, we further generalize this technique to distributed \texttt{ResIST} with layer dropout.
The novelty of our proof is that we only conduct gradient descent on sub-ResNets assigned to each local worker. 
\emph{There is no training iteration with the whole model: this includes the generation of random masks that ``champion'' parts of the whole ResNet model per worker.} Handling such constructions is the gist of this proof:
We carefully analyze the convergence of each subnetwork during local training iterations $\ell$, and prove the global convergence of the combined whole model throughout synchronization rounds $t$. 
The full proof is provided in section B in the Appendix.


\section{Related Work} \label{related_work}

% % ResNet models and related literature, especially preactivation resnet!  and shallow resnet approximate deep resnet paper
% Convolutional neural networks (CNNs) were popularized for CV tasks, starting with the proposal of AlexNet \citep{alexnet}, prior to the proposal of the ResNet architecture \citep{resnet}. 
% ResNets quickly became a standard CNN architecture and are widely used . 

Following ResNet, 
%\citep{fasterrcnn, densenets, maskrcnn, retinanet}, 
most novel architectures continued to leverage residual connections, %\citep{mobilenetv2, inceptionresnet},
which became standard in most architectures. %\citep{transformer, bert, rezero}. 
The ResNet architecture has been further modified. %\citep{preactres, resnext, wideresnet}. 
\emph{This work focuses on the pre-activation ResNet variant \citep{preactres}, as it achieves high performance and is well-suited to layer-wise decomposition.}

% possibly go over some other methods of making training faster
%The available literature on distributed training techniques for deep learning is vast. 
The focus of this study is on synchronous methods of distributed optimization, such as data parallel training, parallel SGD \citep{parallelsgd}, or local SGD \citep{localsgdconverge}.
Our methodology is also a variant of model-parallel training \citep{parallelism_survey, lamp,  nonlinear_multigrid_layer_parallel, layer_parallel_resnet, xpipe, multi_gpu_model_parallel}.
%\footnote{\textcolor{blue}{Tasos: Again, we should refer to some model-based approaches. E.g., the description of the grant i shared has some links to model parallel approaches.}}
%\textcolor{magenta}{Maybe we can mention thee case of model parallel.}
Many studies have explored possible techniques of synchronous, distributed optimization, yielding a wide number of viable variants \citep{use_local_sgd}. %, as well as techniques for MLP decomposition \citep{IST}. %\footnote{Our method applies on residual blocks, not an obvious extension of \citep{IST}, and $i)$ can be used on larger architectures; $ii)$ uses a layer-wise decomposition of the global model; $iii)$ selectively partitions network parameters, and $iv)$ allows parameter overlap, so that subnetworks never become too shallow.}

To reduce communication costs in the distributed setting, both quantization \citep{commeff-sgd, comm-comp} and sparsification \citep{sparse-comm, linear-speed-quant, grad-sparse, adaptive_fed_drop} methods have been explored.
%These methods aim to either reduce the precision of communicated updates or sparsify the updates at certain coordinates.
%\textcolor{blue}{Tasos: we should mention the variants where only uplink is compressed (gradient updates), or both uplink and downlink are compressed (both model sharing to workers and gradient updates are compressed.}
%Both approaches have been shown to successfully reduce communication overhead.
Similarly, other studies have achieved speedups through the use of low-precision arithmetic during training \citep{4minimagenet}.
%\textcolor{blue}{Tasos: we should close this paragraph by stating again that this line of work is orthogonal to our proposal, and can be combined with our ideas.}
However, \emph{this line of work is orthogonal to our proposal and can be easily combined with the provided methodology; see section A.5 in the Appendix.}  

% \begin{table*}[!htp]
% \centering
% \caption{Performance of baseline models and models trained with \texttt{ResIST}. Results for image classification datasets are in terms of test accuracy, while object detection performance is reported as test loss.} %\vspace{0.1cm}
% \begin{tabular}{cccccc}
% \toprule
%     & \# Machines & CIFAR10 & CIFAR100 & SVHN & Pascal VOC\\ \midrule
%     Local SGD & 2 & 92.36\% $\pm$ 0.01 & 70.67\% $\pm$ 0.03 & 96.25\% $\pm$ 0.05 & 6.15 $\pm$ 0.03 \\
%     & 4 & 92.90\% $\pm$ 0.06 & 71.51\% $\pm$ 0.04 & 96.36\% $\pm$ 0.01 &  6.22 $\pm$ 0.06 \\
%     & 8 & 92.00\% $\pm$ 0.07 & 69.64\% $\pm$ 0.05 & 96.24\% $\pm$ 0.03 &  - \\
%     \midrule
%     \texttt{ResIST} & 2 & 91.95\% $\pm$ 0.32 & 70.06\% $\pm$ 0.51 & 96.26\% $\pm$ 0.04 & 5.99 $\pm$ 0.01 \\
%     & 4 & 92.35\% $\pm$ 0.22 & 71.30\% $\pm$ 0.20 & 96.26\% $\pm$ 0.01 & 6.69 $\pm$ 0.17 \\
%     & 8 & 91.45\% $\pm$ 0.30 & 70.26\% $\pm$ 0.21 & 95.87\% $\pm$ 0.01 &  - \\
%  \bottomrule
% \end{tabular}
% \label{cifar10_results}
% %\vspace{-0.3cm}
% \end{table*}


Large batch training is used to amortize communication and increase throughput for distributed training \citep{1hrimagenet, 76minbert}.
%applied large batch training methods to significantly decrease wall clock time to convergence for large image and language models. 
The properties of large batch training have since been studied extensively \citep{15minimagenet, 32Kbatch, you2018imagenet}. 
Large batches alter training dynamics, warranting the use of complex heuristics to maintain comparable performance \citep{32Kbatch}.
Here, \emph{we do not focus on the extension of \texttt{ResIST} to the large-batch training domain.
Rather, we consider this as future work. }
%FIXED -- \textcolor{magenta}{What is our comment on large batch training? Would this lead to questions to reviewers?}


% IST paper - how are we different?
% Independent subnetwork training (IST) \citep{IST} is a recent method of synchronous, distributed training. 
% Using a node-wise partition, IST decomposes MLPs into groups of \emph{disjoint} subnetworks. 
% These subnetworks are optimized independently, and their updates are intermittently aggregated into the global model.
% %A new group of subnetworks is re-partitioned after each aggregation round.
% IST is related to our work, as it similarly reduces the communication and synchronization costs of distributed training.
% However, our method applies on residual blocks (a case that is not straightforwardly implied by \citep{IST}) and $i)$ can be used on larger architectures; $ii)$ uses a layer-wise decomposition of the global model; $iii)$ selectively partitions network parameters, and $iv)$ allows parameter overlap, so that subnetworks never become too shallow.
% \begin{table*}[!htp]
% \centering
% \caption{Performance of baseline models and models trained with \texttt{ResIST}. Results for image classification datasets are in terms of test accuracy, while object detection performance is reported as test loss.} \vspace{0.1cm}
% \begin{tabular}{cc|ccccc}
% \toprule
%     & \# Machines & CIFAR10 & CIFAR100 & SVHN & ImageNet & Pascal VOC\\ \midrule
%     Local SGD & 2 & 92.37\% 92.35\% 92.40\% & 70.64\% 70.70\% 70.83\% & 96.25\% 96.30\% 96.30\% & - & 6.15 $\pm$ 0.03 \\
%     & 4 & 91.66\% $\pm$ 0.12 & 68.39\% $\pm$ 0.13 & 96.36\% $\pm$ 0.01 & - & 6.28 \\
%     \midrule
%     \texttt{ResIST} & 2 & 92.32\% $\pm$ 0.10  91.99\% 91.54\% & 70.77\% 69.59 \% 69.81\% & 96.22 \% 96.25\% 96.32\% & - & 5.99 $\pm$ 0.01 \\
%     & 4 & 90.65\% 92.12\% 92.57\% $\pm$ 0.10 & 68.07\% $\pm$ 0.04 71.51\% 71.04\% & 96.18\% $\pm$ 0.05 & - & 6.91 \\
%     & 8 & 91.29\% $\pm$ 0.12 & 70.18\% $\pm$ 0.21 & 95.88\% 95.86\% & - & - \\
%  \bottomrule
% \end{tabular}
% \label{cifar10_results}
% \end{table*}

% stochastic depth resnet -- really emphasize difference
% filter pruning paper also shows that earlier layers shouldn't be pruned
ResNet robustness to layer removal was explored in \citep{stochdepth}, while \citep{shallowensemble} showed that ensembles of shallow ResNets can yield high performance.  
\citep{stochdepth} uses shallow networks during training and scales activations so that all layers may be used for inference. 
However, our approach is distinct in numerous ways.
Primarily, \emph{our method partitions blocks in a stochastic, round-robin fashion, which explicitly prevents the exclusion of layers from training rounds and yields reduced subnetwork depth compared to \citep{stochdepth}.}
%Primarily, \citep{stochdepth} assigns survival probabilities to each layer within the ResNet and samples from a Bernoulli distribution to determine which layers should be removed. 
%In contrast, the proposed approach only partitions certain residual blocks to subnetworks (i.e., all others are shared) and allows some residual blocks to be partitioned simultaneously to multiple subnetworks. 
Inspired by \citep{filterprune}, we also selectively partition residual blocks that are least sensitive to pruning, allowing other layers (i.e., $\sim$30\% of total layers) to be shared between subnetworks.
Unlike \citep{stochdepth}, we avoid partitioning strided layers, which are sensitive to pruning \citep{filterprune}.
Furthermore, our methodology, instead of proposing a form of regularization, focuses on utilizing independent training of shallow sub-ResNets for efficient, distributed training. % - an idea not explored by previous work.

% neural ODE stuff; MF resnet formulation
Our approach also relates to neural ODE literature.
This line of research interprets ResNets as a discrete approximation of a continuous transformation from input to output \citep{beyondfinite}. 
The neural ODE perspective has been studied both empirically \citep{neuralode, augmentedneuralode, beyondfinite} and theoretically \citep{mfresnet, deeplimit}. 
\emph{This provides justification to our approach, as removing ResNet layers can be viewed as approximating the same transformation with a coarser discretization.}

\section{Experimental Details}
\label{S:exp_det}
%\vspace{-0.2cm}
%\texttt{ResIST} is tested empirically across multiple computer vision tasks and datasets.
Hyperparameters are tuned using a holdout validation set and results are obtained using optimal hyperparameters from the validation set. 
All experiments are repeated for three trials, and the average performance is presented. 
We adopt local SGD as our baseline for synchronous, distributed training methods. %, as it is argued to be superior in comparison to vanilla data-parallel training \citep{use_local_sgd}; see Sec. \ref{S:Implementation} for more details.
%We evaluate the proposed training methodology based on model performance and speed.
\emph{In all cases, \texttt{ResIST} achieves comparable performance to local SGD, while lowering the total wall-clock time of training.}
We use AWS p3.8xlarge instances for experiments with two or four machines\footnote{In section A.4-Appendix and for the Pascal VOC experiment with two machines, we use a cluster with eight V100 GPUs.} and p3.16xlarge instances for experiments with eight machines. 
We use each GPU as a single worker that hosts a different sub-ResNet. We assign each worker a different random seed, so that at each training iteration, it will sample different batches of data.
% For Sec. \ref{S:deep_arch}, we are using a local cluster with eight 32GB V100.

\noindent
\textbf{Small-Scale Image Classification.}
Models are trained with \texttt{ResIST} on CIFAR10 and CIFAR100 for image classification.
We adopt standard data augmentation techniques during training and testing \citep{resnet}.
%\textcolor{magenta}{For small-scale datasets, w
We adopt a batch size of 128 for each worker.
%, while for ImageNet a batch size of 32.\footnote{For ImageNet, images are also randomly-cropped to a size of $224\times224$ during training, and center-cropped to a size of $256\times256$ during evaluation.}}
Training is conducted for 80 epochs for experiments with two machines and 160 epochs for experiments with four or eight machines.
% For all datasets, training is conducted for 80 epochs on 2-GPU model experiments and 160 epochs on 4- and 8-GPU model experiments.
The recorded performance reflects the best test accuracy achieved throughout training, averaged across three trials.
The total wall-clock training time is also reported for each experiment.
%Tests are performed using two, four, and eight sub-ResNets for \texttt{ResIST} and localSGD.

\noindent
\textbf{ImageNet Classification.}
Models are trained with \texttt{ResIST} on the 1,000-class ILSVRC2012 image classification dataset \citep{imagenet}.
We adopt standard data augmentation techniques during training and testing, and use a batch size of 256 for each worker  \citep{resnet}.
Training is conducted for 90 epochs.
We initialize the learning rate to 0.1 and decrease it $10\times$ at epochs 30 and 60.
% The learning rate is initialized as 0.1 and decreased by $10\times$ at epochs 30, 60 and 90.
% Because ImageNet is a largeAs mentioned in Sec. \ref{S:supp_tech}, because Imagenet is a large scale and relative difficult dataset,
For all experiments, we set $\ell=15$, adopt a minimum depth of 10 blocks for each sub-ResNet, and perform warm-up pre-training using local SGD. %(3200 synchronizations of pretraining for 2-GPU model and 2000 synchronizations of pretraining for 4-GPU model).
For both \texttt{ResIST} and baseline experiments, we utilize momentum restarts and aggregate batch statistics every 1300 synchronization rounds.
% We also use momentum restart and synchronize batch statistic every 1300 synchronizations for both \texttt{ResIST} and baseline.   

\iffalse
\begin{table*}[!htp]
\centering
\caption{Performance of baseline models and models trained with \texttt{ResIST}. Results for image classification datasets are in terms of test accuracy, while object detection performance is reported as test loss.} %\vspace{0.1cm}
\begin{tabular}{cccccc}
\toprule
    & \# Machines & CIFAR10 & CIFAR100 & SVHN & Pascal VOC\\ \midrule
    Local SGD & 2 & 92.36\% $\pm$ 0.01 & 70.67\% $\pm$ 0.03 & 96.25\% $\pm$ 0.05 & 6.15 $\pm$ 0.03 \\
    & 4 & 91.66\% $\pm$ 0.12 & 68.39\% $\pm$ 0.13 & 96.36\% $\pm$ 0.01 &  6.28 \\
    \midrule
    \texttt{ResIST} & 2 & 91.95\% $\pm$ 0.32 & 70.06\% $\pm$ 0.51 & 96.26\% $\pm$ 0.04 & 5.99 $\pm$ 0.01 \\
    & 4 & 91.78\% $\pm$ 0.82 & 71.28\% $\pm$ 0.24 & 96.18\% $\pm$ 0.05 &  6.69 $\pm$ 0.17 \\
    & 8 & 91.29\% $\pm$ 0.12 & 70.18\% $\pm$ 0.21 & 95.87\% $\pm$ 0.01 &  - \\
 \bottomrule
\end{tabular}
\label{cifar10_results}
%\vspace{-0.5cm}
\end{table*}
\fi

\noindent
\textbf{Object Detection.}
\texttt{ResIST} is tested in the object detection domain on the Pascal VOC dataset \citep{pascalvoc}. %to demonstrate the ability of \texttt{ResIST} to train ResNets on more complex tasks.
Our model, inspired by the Yolo-v2 object detection model \citep{yolov2}, consists of a ResNet101 backbone followed by a detection layer (i.e., a $1 \times 1$ convolution that outputs anchor box predictions).
The ResNet backbone of this model is similar to the classification model described in Sec. \ref{model_arch}, but without the pre-activation structure.
%However, we do not use a pre-activation architecture because the ResNet weights must be pre-trained on \textcolor{magenta}{ImageNet} \citep{imagenet} for the model to converge, and pre-trained weights for pre-activation ResNets are not yet available in PyTorch \citep{pytorch}. 
%\footnote{Because pre-activation architectures yield significant performance improvements for image classification (see Sec. \ref{model_arch}), we hypothesize that the ability to use pre-activation ResNet architectures for object detection would similarly improve the performance of \texttt{ResIST}.}
The model is trained for 100 epochs with an image dimension of $448\times448$ and batch size of 10.
No data augmentation techniques are used.
The learning rate is increased from $10^{-5}$ to $10^{-4}$ over the first 30 epochs, and decreased by $10\times$ at epochs 60 and 90.
Both Pascal VOC 2007 and 2012 training sets are used during training, and performance is evaluated on the Pascal VOC 2007 test set.
We report the wall-clock training time and the best loss achieved on the test set throughout training.
Experiments are conducted on two and four machines using both local SGD and \texttt{ResIST}.
%Aside from the differences highlighted above, the object detection experiments follow the \texttt{ResIST} methodology as outlined in Sec. \ref{methods}.


% \subsection{Training Speed} \label{speed_results}
% Here, we need to provide concrete results regarding the wall-clock time of training the proposed algorithm.

% \subsubsection{Node Distributed Domain}

% \begin{figure}
% \centering
% \begin{subfigure}
%   \centering
%   \includegraphics[width=\linewidth]{images/c10_comm_sim.pdf}
% \end{subfigure}

% \begin{subfigure}
%   \centering
%   \includegraphics[width=\linewidth]{images/c100_comm_sim.pdf}
% \end{subfigure}
% \caption{Wall clock time to convergence of ResIST and LSGD in the high communication cost scenario.}
% \label{fig:high_comm}
% \end{figure}

% Because ResIST limits communication between nodes, it further outperforms synchronous, distributed baseline methods, such as local SGD, in scenarios where communication is more expensive.
% In some cases, for example, one does not have access to multiple GPUs on a single machine, but rather multiple GPUs distributed across several nodes.
% In such a case, communication between GPUs becomes significantly more expensive, allowing the benefit of communication efficiency to become more visible.
% To simulate this environment, the communication time of ResIST and local SGD are dynamically increased.
% In particular, the duration of each communication operation during training of both methods is measured and the communication cost is synthetically increased by a constant factor of the duration.
% As can be seen in Fig. \ref{fig:high_comm}, as communication becomes more expensive, a larger difference in wall clock time to convergence between ResIST and local SGD arises, thus demonstrating that ResIST continues to improve over the speed of local SGD in domains with higher communication cost.

% \begin{table*}[!htp]
% \centering
% %\vspace{-0.4cm}
% \caption{Performance of baseline LocalSGD versus \texttt{ResIST}. Results for image classification datasets are in terms of test accuracy, while object detection performance is reported as test loss.} %\vspace{0.1cm}
% \begin{tabular}{cccccc}
% \toprule
%     & \# Machines & CIFAR10 & CIFAR100 & SVHN & Pascal VOC\\ \midrule
%     Local SGD & 2 & 92.36\% $\pm$ 0.01 & 70.67\% $\pm$ 0.03 & 96.25\% $\pm$ 0.05 & 6.15 $\pm$ 0.03 \\
%     & 4 & 92.90\% $\pm$ 0.06 & 71.51\% $\pm$ 0.04 & 96.36\% $\pm$ 0.01 &  6.22 $\pm$ 0.06 \\
%     & 8 & 92.00\% $\pm$ 0.07 & 69.64\% $\pm$ 0.05 & 96.24\% $\pm$ 0.03 &  - \\
%     \midrule
%     \texttt{ResIST} & 2 & 91.95\% $\pm$ 0.32 & 70.06\% $\pm$ 0.51 & 96.26\% $\pm$ 0.04 & 5.99 $\pm$ 0.01 \\
%     & 4 & 92.35\% $\pm$ 0.22 & 71.30\% $\pm$ 0.20 & 96.26\% $\pm$ 0.01 & 6.69 $\pm$ 0.17 \\
%     & 8 & 91.45\% $\pm$ 0.30 & 70.26\% $\pm$ 0.21 & 95.87\% $\pm$ 0.01 &  - \\
%  \bottomrule
% \end{tabular}
% \label{cifar10_results}
% %\vspace{-0.3cm}
% \end{table*}

\begin{table}[!htp]
\centering
%\vspace{-0.4cm}
\caption{Test accuracy of baseline local SGD versus \texttt{ResIST} on small-scale image classification datasets.} %\vspace{0.1cm}
\setlength{\tabcolsep}{.5\tabcolsep}
\begin{small}
\begin{tabular}{cccc}
\toprule
    & \# Machines & CIFAR10 & CIFAR100 \\ \midrule
    Local SGD & 2 & 92.36\% $\pm$ 0.01 & 70.67\% $\pm$ 0.03  \\
    & 4 & 92.90\% $\pm$ 0.06 & 71.51\% $\pm$ 0.04  \\
    & 8 & 92.00\% $\pm$ 0.07 & 69.64\% $\pm$ 0.05  \\
    \midrule
    \texttt{ResIST} & 2 & 91.95\% $\pm$ 0.32 & 70.06\% $\pm$ 0.51  \\
    & 4 & 92.35\% $\pm$ 0.22 & 71.30\% $\pm$ 0.20  \\
    & 8 & 91.45\% $\pm$ 0.30 & 70.26\% $\pm$ 0.21  \\
 \bottomrule
\end{tabular}
\end{small}
\label{cifar10_results}

\end{table}

%\section{Ablations and Analysis} \label{ablation}

\section{Results}

\subsection{Small-Scale Image Classification}

\begin{table*}[!htp]
\centering
\caption{Performance of baseline models and models trained with \texttt{ResIST} on 1K ImageNet \citep{recht2019imagenet}. MF stands for test set ``MatchedFrequency'' and was sampled to match the MTurk selection frequency distribution of the original ImageNet validation set; T-0.7 stands for test set ``Threshold0.7'' and was built by sampling ten images for each class among the candidates with selection frequency at least 0.7; TI stands for test set ``TopImages'' and contains the ten images with highest selection frequency for each class.}
\setlength{\tabcolsep}{.5\tabcolsep}
\begin{small}
\begin{tabular}{cccccccccc}
\toprule
    & \multirow{2}{*}{\# Machines} & \multirow{2}{*}{ImageNet} &  \multicolumn{3}{c}{ImageNet V2 Test Set}  & \multirow{2}{*}{Training Time} & \multirow{2}{*}{Speedup} & \multirow{2}{*}{Communication} & \multirow{2}{*}{Cost Ratio}\\
    & & & MF & T-0.7 & TI & & & & \\ \midrule
    Local SGD & 2 & 73.32\% & 60.72\% & 69.47\% & 75.48\% & 48.61 hours & - & 7546.80 GB & -\\
    & 4 & 72.66\% & 59.88\% & 68.34\% & 74.27\% & 29.29 hours & - & 7546.80 GB & -  \\
    \midrule
    \texttt{ResIST} & 2 & 71.60\% & 58.92\% & 67.51\% & 73.56\% & 36.79 hours &  \textbf{1.32$\times$} & 5831.2 GB & \textbf{1.29$\times$}\\
    & 4 & 70.74\% &57.56\% & 66.46\% & 72.65\% & 22.37 hours &  \textbf{1.31$\times$} & 6007.6 GB & \textbf{1.26$\times$}\\
 \bottomrule
\end{tabular}
\end{small}
\label{imagenet_results}
%\vspace{-0.3cm}
\end{table*}

%In this section, we present the results of experiments conducted with both \texttt{ResIST} and local SGD.
%For all experiments, model performance is measured in terms of test accuracy and total training time.
%\texttt{ResIST} is shown to significantly accelerate training in comparison to local SGD without any degredation in model accuracy.

\noindent
\textbf{Accuracy.}
The test accuracy on small-scale image classification datasets is listed in Table \ref{cifar10_results}.
\emph{\texttt{ResIST} achieves comparable test accuracy in all cases where the same number of machines are used.}
\texttt{ResIST} outperforms local SGD on CIFAR100 experiments with eight machines.
% \texttt{ResIST} does outperform local SGD for eight machines on CIFAR100 dataset and slightly outperform four machines on the SVHN dataset, but \texttt{ResIST} does not yield consistent improvements in performance.
The performance of \texttt{ResIST} and local SGD is strikingly similar in terms of test accuracy.
In fact, the performance gap between the two methods does not exceed 1\% in any experimental setting.
% In fact, the performance gap between \texttt{ResIST} and local SGD never exceeds 1\% in any of the experimental settings that were tested for image classification.
Furthermore, \texttt{ResIST} performance remains stable as the number of sub-ResNets increases, allowing greater acceleration to be achieved without degraded performance (e.g., see CIFAR100 results in Table \ref{cifar10_results}).
Generally, using four sub-ResNets yields the best performance with \texttt{ResIST}.

% As shown in Table \ref{cifar10_results}, the performance of \texttt{ResIST} remains stable as the number of sub-ResNets is increased.
% For example, on the CIFAR100 dataset, the performance of eight sub-ResNets exceed the performance of two sub-ResNets for \texttt{ResIST}.
% Similarly for other datasets, test accuracy did not degrade significantly as the number of sub-ResNets was increased, thus allowing greater acceleration to be achieved without noticeable detriment to performance.
% Generally, the highest test accuracy was achieved with the use of four sub-ResNets for \texttt{ResIST}. 

\noindent
\textbf{Efficiency.}
In addition to achieving comparable test accuracy, \texttt{ResIST} significantly accelerates training.
This acceleration is due to $i)$ fewer parameters being communicated between machines and $ii)$ locally-trained sub-ResNets being shallower than the global model.
Wall-clock training times for four- and eight-machine experiments are presented in Table~\ref{small_dataset_results}. %\footnote{We report results specifically for 4 machines because this setting achieves the best performance, in terms of both speed and accuracy, in most cases for \texttt{ResIST}.}
\texttt{ResIST} provides a $3.58\times$ to $3.81\times$ speedup in comparison to local SGD.
For eight-machine experiments, a significant speedup over four-machine experiments is not observed due to the minimum depth requirement and a reduction in the number of local iterations to improve training stability.
We conjecture that for cases with higher communication cost at each synchronization and a similar number of synchronizations, eight-worker \texttt{ResIST} could lead to more significant speedups in comparison to the four-worker case.


% As mentioned in Sec. \ref{S:supp_tech}, sub-ResNets could become too shallow in 8-GPU model experiments, which harms the performance and stability of training. 
% Thus, we enforce a minimum depth requirement for each sub-ResNet, by simultaneously partitioning and sharing certain blocks to multiple sub-ResNets. 
% We also decrease the number of local iterations to further improve the stability. 
% This causes a increase in communication cost for each synchronization and overall longer training time compared to \texttt{ResIST} with 4-GPU. (but still maintaining the speedup compared to both four and eight model local SGD baseline). 
% This is mainly due to the fact that eight or more workers do not provide significant decrease in the number of training iterations and synchronizations to converge to the global optimal compared to four workers. 
% We conjecture that for cases with higher communication cost at each synchronization, and similar number of synchronizations, eight worker \texttt{ResIST} could lead to further speedups, compared to the four worker \texttt{ResIST}.      


% As mentioned in Sec. \ref{S:supp_tech}, sub-ResNets could become too shallow in 8-GPU model experiments, which harms the performance and stability of training. 
% Thus, we enforce a minimum depth requirement for each sub-ResNet, by simultaneously partitioning and sharing certain blocks to multiple sub-ResNets. 
% We also decrease the number of local iterations to further improve the stability. 
% This causes a increase in communication cost for each synchronization and overall longer training time compared to \texttt{ResIST} with 4-GPU. (but still maintaining the speedup compared to both four and eight model local SGD baseline). 
% This is mainly due to the fact that eight or more workers do not provide significant decrease in the number of training iterations and synchronizations to converge to the global optimal compared to four workers. 
% We conjecture that for cases with higher communication cost at each synchronization, and similar number of synchronizations, eight worker \texttt{ResIST} could lead to further speedups, compared to the four worker \texttt{ResIST}.      


% In fact, the decreased communication and training costs provided by \texttt{ResIST} can lead to accelerations of $\times 1.49$ to $\times 3.81$, compared to local SGD, in wall-clock training time. 
%Such a significant training acceleration highlights the improved efficiency of \texttt{ResIST} in comparison to local SGD.

%\textcolor{magenta}{Tasos: We need a paragraph that comments on Table 4. We need to conjecture why we do not get higher speedup.}

% \begin{table}[!htb]
%     %\caption{Total training time in seconds of models trained with both local SGD and \texttt{ResIST}.}
%     %\begin{minipage}{.47\linewidth}
%         \caption{Total training time in seconds of models trained with both local SGD and \texttt{ResIST} using four machines.} %\vspace{0.1cm}
%             \begin{tabular}{cccc}
%             \toprule
%                  & Local SGD & \texttt{ResIST} & Speedup \\ \midrule
%                 C10 & 5486 $\pm$ 7.05 & 1532 $\pm$ 0.83 & \textbf{$\times$ 3.60}\\
%                  C100 & 5528 $\pm$ 65.90 & 1545 $\pm$ 1.27 & \textbf{$\times$ 3.58}\\
%                 %ImageNet & - & -\\
%                 VOC & 16840 $\pm$ 0.11 &  11264 $\pm$ 49.38 & \textbf{$\times$ 1.49}\\
%             \bottomrule
%             \end{tabular} \label{cifar10_speed}
%     %\end{minipage} %\hspace{0.5cm}
%     %\begin{minipage}{.47\linewidth}
%       %\vspace{-0.33cm}
%       %\begin{small}
%       \vspace{0.2cm}
% \caption{Total training time in seconds of models trained with both local SGD and \texttt{ResIST} using eight machines.} %\vspace{0.1cm}
% \begin{tabular}{cccc}
% \toprule
%      & Local SGD & \texttt{ResIST} & Speedup \\ \midrule
%     C10 & 10072 $\pm$ 5.12 & 2671 $\pm$ 3.25 & \textbf{$\times$ 3.77}\\
%     C100 & 10058 $\pm$ 8.71 & 2639 $\pm$ 3.89 & \textbf{$\times$ 3.81}\\
%     %ImageNet & - & -\\
%  \bottomrule
% % \toprule
% %     Dataset & Local SGD & \texttt{ResIST} & Speedup \\ \midrule
% %     CIFAR10 & 3735.23s $\pm$ 7.05 & \textbf{952.05s $\pm$ 0.83} & \\
% %     CIFAR100 & 3649.98s $\pm$ 65.90 & \textbf{948.94s $\pm$ 1.27} & \\
% %     SVHN & 5551.84s $\pm$ 9.27 & \textbf{1374.53s $\pm$ 0.74} & \\
% %     %ImageNet & - & -\\
% %     PascalVOC & 16840.42s & \textbf{11264.30s $\pm$ 49.38} & \\
% %  \bottomrule
% \end{tabular}
% \label{eight_speed}
% %\end{small}
% %    \end{minipage} 
%     %\vspace{-0.6cm}
% \end{table}

\begin{table}[!htp]
\centering

\caption{Training time in seconds of baseline models and models trained with \texttt{ResIST} on small-scale image classification datasets.} 
\setlength{\tabcolsep}{.5\tabcolsep}
\begin{small}
\begin{tabular}{ccccc}
\toprule
    & \# Machines & Dataset & Total Time & Speedup \\ \midrule
    Local SGD & 4 & C10 & 5486 $\pm$ 7.05 & -\\
    & & C100 & 5528 $\pm$ 65.90 & - \\
    %& & VOC & 16840 $\pm$ 0.11 & - \\
     & 8 & C10 & 10072 $\pm$ 5.12 & -\\
      &  & C100 & 10058 $\pm$ 8.71 & -\\
    \midrule
        \texttt{ResIST} & 4 & C10 & 1532 $\pm$ 0.83 & \textbf{3.60$\times$ }\\
    & & C100 &  1545 $\pm$ 1.27 & \textbf{3.58$\times$} \\
    %& & VOC & 11264 $\pm$ 49.38 & \textbf{$\times$ 1.49} \\
     & 8 & C10 & 2671 $\pm$ 3.25 & \textbf{3.77$\times$}\\
      &  & C100 & 2639 $\pm$ 3.89 & \textbf{3.81$\times$}\\
 \bottomrule
\end{tabular}
\end{small}
\label{small_dataset_results}
%\vspace{-0.3cm}
\end{table}

% \begin{table}[!htp]
% \centering
% \begin{small}
% \caption{Total training time in seconds of models trained with both local SGD and \texttt{ResIST} using four machines.} \vspace{0.1cm}
% \begin{tabular}{cccc}
% \toprule
%     Dataset & Local SGD & \texttt{ResIST} & Speedup \\ \midrule
%     CIFAR10 & 5486 $\pm$ 7.05 & 1532 $\pm$ 0.83 & \textbf{$\times$ 3.60}\\
%     CIFAR100 & 5528 $\pm$ 65.90 & 1545 $\pm$ 1.27 & \textbf{$\times$ 3.58}\\
%     %ImageNet & - & -\\
%     PascalVOC & 16840 $\pm$ 0.11 &  11264 $\pm$ 49.38 & \textbf{$\times$ 1.49}\\
%  \bottomrule
% % \toprule
% %     Dataset & Local SGD & \texttt{ResIST} & Speedup \\ \midrule
% %     CIFAR10 & 3735.23s $\pm$ 7.05 & \textbf{952.05s $\pm$ 0.83} & \\
% %     CIFAR100 & 3649.98s $\pm$ 65.90 & \textbf{948.94s $\pm$ 1.27} & \\
% %     SVHN & 5551.84s $\pm$ 9.27 & \textbf{1374.53s $\pm$ 0.74} & \\
% %     %ImageNet & - & -\\
% %     PascalVOC & 16840.42s & \textbf{11264.30s $\pm$ 49.38} & \\
% %  \bottomrule
% \end{tabular}
% \label{cifar10_speed}
% \end{small}
% \vspace{-0.3cm}
% \end{table}

% \begin{table}[!htp]
% \centering
% \begin{small}
% \caption{Total training time in seconds of models trained with both local SGD and \texttt{ResIST} using eight machines.} \vspace{0.1cm}
% \begin{tabular}{cccc}
% \toprule
%     Dataset & Local SGD & \texttt{ResIST} & Speedup \\ \midrule
%     CIFAR10 & 10072 $\pm$ 5.12 & 2671 $\pm$ 3.25 & \textbf{$\times$ 3.77}\\
%     CIFAR100 & 10058 $\pm$ 8.71 & 2639 $\pm$ 3.89 & \textbf{$\times$ 3.81}\\
%     %ImageNet & - & -\\
%  \bottomrule
% % \toprule
% %     Dataset & Local SGD & \texttt{ResIST} & Speedup \\ \midrule
% %     CIFAR10 & 3735.23s $\pm$ 7.05 & \textbf{952.05s $\pm$ 0.83} & \\
% %     CIFAR100 & 3649.98s $\pm$ 65.90 & \textbf{948.94s $\pm$ 1.27} & \\
% %     SVHN & 5551.84s $\pm$ 9.27 & \textbf{1374.53s $\pm$ 0.74} & \\
% %     %ImageNet & - & -\\
% %     PascalVOC & 16840.42s & \textbf{11264.30s $\pm$ 49.38} & \\
% %  \bottomrule
% \end{tabular}
% \label{eight_speed}
% \end{small}
% \vspace{-0.1cm}
% \end{table}
%\vspace{-0.3cm}

\begin{figure}[h]
    %\vspace{-0.7cm}
    \centering
    \includegraphics[width=1\linewidth]{images/cifar10_timing.pdf}

    \caption{%Test accuracy of ResIST and local SGD (LSGD) on CIFAR10 and CIFAR100. 
    Both methodologies complete 160 epochs of training. Accuracy values are smoothed using a 1-D Gaussian filter, and shaded regions represent deviations in accuracy.}
    \label{fig:cifar10_times}
\end{figure}

The speedup provided by \texttt{ResIST} on the CIFAR10 and CIFAR100 datasets is illustrated in Fig.~\ref{fig:cifar10_times}.
% An acceleration over local SGD is achieved using \texttt{ResIST} with both 2- (not shown graphically) and 4-GPU sub-ResNets
%\textcolor{magenta}{Tasos: the figures show only one case - probably the 8-GPU case. Can we have the 2 and 4 GPU cases also? Not sure whether these should be in different figures.}
Models trained with \texttt{ResIST} match the final accuracy of those trained with local SGD.
Furthermore, increasing the number of sub-ResNets yields an improved speedup for \texttt{ResIST} in comparison to local SGD.
%\emph{In comparison, using four machines for local SGD actually slowed down training in comparison to using two machines (i.e., this slow down is caused by increased communication costs of local SGD in the 4-GPU setting).}
It is clear that the communication efficiency of \texttt{ResIST} allows the benefit of more devices to be better realized in the distributed setting.

%\vspace{-0.2cm}
\subsection{Large-Scale Image Classification}
%\vspace{-0.2cm}

\noindent
\textbf{Accuracy.} The test accuracy of models trained with both \texttt{ResIST} and local SGD for different numbers of machines on the ImageNet dataset is listed in Table \ref{imagenet_results}.
As can be seen, \emph{\texttt{ResIST} achieves comparable test accuracy ($<2\%$ difference) to local SGD in all cases.}
Additionally, as shown by \citet{recht2019imagenet}, many current image classification models overfit to the ImageNet test set and cannot generalize well to new data.
Thus, models trained with both local SGD and \texttt{ResIST} are also evaluated on three different ImageNet V2 testing sets \citep{recht2019imagenet}.
As shown in Table~\ref{imagenet_results}, \texttt{ResIST} consistently achieves test accuracy comparable to that of local SGD on these supplemental test sets.

%& Imagenet V2 MatchedFrequency &Imagenet V2 Threshold0.7 & Imagenet V2 TopImages

\noindent
\textbf{Efficiency.} 
As shown in Tables \ref{imagenet_results} and \ref{reach_imagenet_speed}, \texttt{ResIST} significantly accelerates the ImageNet training process.
However, due to the use of fewer local iterations and the local SGD warm-up phase, the speedup provided by \texttt{ResIST} is smaller relative to experiments on small-scale datasets.
As shown in Table~\ref{imagenet_results}, \texttt{ResIST} also reduces the total communication volume during training, which is an important feature in the implementation of distributed systems with high communication costs.



\begin{table}[!htp]
\centering
\begin{small}
\caption{Total training time on ImageNet (in hours) of models trained with both local SGD and \texttt{ResIST} using two and four machines to reach a fixed test accuracy.}
\setlength{\tabcolsep}{.5\tabcolsep}
\begin{tabular}{ccccc}
\toprule
    \# Machines & Target Accuracy & Local SGD & \texttt{ResIST} & Speedup \\ \midrule
    2  & 71.00 & 33.26 & 26.63 & \textbf{1.25$\times$ }\\
    4  & 70.70 & 18.50 & 18.12 & \textbf{1.02$\times$ }\\

 \bottomrule
% \toprule
%     Dataset & Local SGD & \texttt{ResIST} & Speedup \\ \midrule
%     CIFAR10 & 3735.23s $\pm$ 7.05 & \textbf{952.05s $\pm$ 0.83} & \\
%     CIFAR100 & 3649.98s $\pm$ 65.90 & \textbf{948.94s $\pm$ 1.27} & \\
%     SVHN & 5551.84s $\pm$ 9.27 & \textbf{1374.53s $\pm$ 0.74} & \\
%     %ImageNet & - & -\\
%     PascalVOC & 16840.42s & \textbf{11264.30s $\pm$ 49.38} & \\
%  \bottomrule
\end{tabular}
\label{reach_imagenet_speed}
\end{small}

\end{table}

\subsection{Object Detection}
\textbf{Loss.} The test loss of models trained with both \texttt{ResIST} and local SGD for different numbers of machines on the Pascal VOC object detection dataset is listed in Table \ref{obj_detection}.
Notably, \texttt{ResIST} achieves a lower test loss in comparison to local SGD for the experiment with two machines.
Although the test loss achieved by \texttt{ResIST} is slightly worse than local SGD in the four machine case, the performance is comparable.
Namely, the difference in test loss achieved by local SGD and \texttt{ResIST} never exceeds a value of one.
% Notably, compared with local SGD, \texttt{ResIST} achieves lower test loss in experiment with two machines and comparable test loss in experiment with four machines. 

\noindent
\textbf{Efficiency.} In addition to achieving comparable or improved test loss in comparison to local SGD, \texttt{ResIST} also provides a significant training acceleration on the Pascal VOC dataset.
In particular, models trained with \texttt{ResIST} achieve up to a $1.64\times$ acceleration in comparison to object detection models trained with local SGD. %\texttt{ResIST} achieves comparable and even better test loss, it also accelerates training as shown in Table \ref{obj_detection} up to \textbf{$\times$ 1.64}.


\begin{table}[!htp]
\centering

\begin{small}
\caption{Test loss and total training time (in seconds) on Pascal VOC for models trained with both local SGD and \texttt{ResIST} using two and four machines.}
\setlength{\tabcolsep}{.5\tabcolsep}
\begin{tabular}{ccccc}
\toprule
    & \# Machines & Test Loss & Train Time & Speedup \\ \midrule
    Local SGD & 2 & 6.15 $\pm$ 0.03 & 39621 $\pm$ 9.12 & -\\
    & 4 & 6.22 $\pm$ 0.06 & 16840 $\pm$ 0.11 & -\\
    \midrule
    \texttt{ResIST} & 2 & 5.99 $\pm$ 0.01 & 24058 $\pm$ 3.22 & \textbf{1.64$\times$ }\\
    & 4 & 6.69 $\pm$ 0.17 & 11264 $\pm$ 49.38 & \textbf{1.49$\times$}\\
 \bottomrule
\end{tabular}
\label{obj_detection}
\end{small}

\end{table}


\subsection{More experiments}

In Appendix A, we outline numerous ablation experiments that were performed using \texttt{ResIST}.
These experiments provide an understanding of the algorithm's behavior, as well as empirical support for its design: they include \texttt{ResIST} design decisions (section A.1), comparison of \texttt{ResIST} with ensemble methods (section A.2), robustness to local iterations (section A.3), applicability of \texttt{ResIST} to deeper architectures (section A.4), and compatibility with existing quantization/sparsification techniques (section A.5).
% \section{Ablations}
% %We outline numerous ablation experiments that were performed using \texttt{ResIST}.
% These experiments provide an understanding of the algorithm's behavior, as well as empirical support for its design.

% \subsection{Designing \texttt{ResIST}} \label{design_ablation}
% Extensive ablation experiments are conducted on the CIFAR10 dataset, outlined in Fig. \ref{resist_ablations}, to empirically motivate the design choices made within \texttt{ResIST} (i.e., see Sec. \ref{S:supp_tech}).
% For the two sub-ResNet case, the naive implementation of \texttt{ResIST}, which evenly splits all convolutional blocks between subnetworks, is shown to perform poorly (i.e., $<$70\% on CIFAR10).
% The accuracy of \texttt{ResIST} is improved over 25\% by only allowing select layers to be partitioned and ensuring activations are scaled correctly when performing inference with the full network.
% The pre-activation ResNet is shown to yield an improvement in accuracy, leading \texttt{ResIST} to perform near optimally with two sub-ResNets.

% \begin{figure}[!htp]
% \includegraphics[width=3.25in]{images/resist_ablations.png} \vspace{-0.3cm}
% \caption{Test accuracies on the CIFAR10 dataset for a single run for the major ablation experiments performed with \texttt{ResIST}.}
% \label{resist_ablations}

% %\vspace{-0.2cm}
% \end{figure}

% When \texttt{ResIST} is expanded to eight sub-ResNets, we initially observe a significant decrease in model accuracy.
% However, as can be seen in Fig. \ref{resist_ablations}, this gap can be closed by enforcing a minimum depth on sub-ResNets and tuning the number of local iterations.
% By making these extra modifications, \texttt{ResIST} begins to perform similarly with two to eight sub-ResNets, yielding compelling performance.% in different experimental settings.

% \subsection{Shallow Ensembles}

% % \begin{table}[]
% % \centering
% % \begin{scriptsize}
% % \caption{Performance of indpendently-trained ensembles of shallow ResNets on CIFAR10 and CIFAR100 (denoted as C10 and C100, respectively.}
% %  \vspace{0.1cm}
% % \begin{tabular}{cccccc}
% % \toprule

% %    Dataset & Method & 2 Model & 4 Model & 8 Model \\ \midrule
% %    C10 & Ensemble & 92.27 % & 92.53 & 90.67 \% \\
% %    & \texttt{ResIST} & 91.95\% $\pm$ 0.32 & 92.35\% $\pm$ 0.22 & 91.29 $\pm$ 0.12\\
% %    \midrule
% %    C100 & Ensemble & 72.08\% $\pm$ 0.05 & 72.12\% & 68.10\% \\
% %    & \texttt{ResIST} & 70.06\% $\pm$ 0.51 & 71.30\% $\pm$ 0.20 & 70.18\% $\pm$ 0.21 \\


% %  \bottomrule
% % \end{tabular}
% % \label{ensemble_perf}
% % \end{scriptsize}
% % \end{table}


% The \texttt{ResIST} algorithm requires that independently-trained sub-ResNets must have their parameters synchronized intermittently. 
% Such synchronization, however, can be completely avoided by training each sub-ResNet separately and forming an ensemble (i.e., \texttt{ResIST} without any aggregation).
% Although maintaining an ensemble has several drawbacks (e.g., slower inference, more parameters, etc.), the training time of the ensemble would nonetheless be reduced in comparison to \texttt{ResIST} by avoiding communication altogether. 
% Therefore, the performance of such an ensemble should be compared to the models trained with \texttt{ResIST}.

% \begin{table}[!ht]
% %\vspace{-0.4cm}
% \centering
% \begin{scriptsize}
% \caption{Performance of indpendently-trained ensembles of shallow ResNets in comparison to \texttt{ResIST} on CIFAR10 and CIFAR100 (denoted as C10 and C100, respectively).}
%  \vspace{0.2cm}
%  \setlength{\tabcolsep}{.2\tabcolsep}
% \begin{tabular}{ccccccccc}
% \toprule
% %    Dataset & Method & 2 Model & 4 Model & 8 Model \\ \midrule
% %    C10 & Ensemble & \textbf{92.27\% $\pm$ 0.00} & 91.19\% $\pm$ 0.01 & 88.04\% $\pm$ 0.02 \\
% %    & \texttt{ResIST} & 91.95\% $\pm$ 0.32 & \textbf{91.78\% $\pm$ 0.82} & \textbf{91.29\% $\pm$ 0.12}\\
% %    \midrule
% %    C100 & Ensemble & \textbf{72.08\% $\pm$ 0.05} & 69.21\% $\pm$ 0.08 & 60.90\% $\pm$ 0.12\\
% %    & \texttt{ResIST} & 70.06\% $\pm$ 0.51 & \textbf{71.28\% $\pm$ 0.24}  & \textbf{70.18\% $\pm$ 0.21} \\
%     Dataset & Method & & 2 Model & &  4 Model & & 8 Model \\ \midrule
%     C10 & Ensemble & & 92.27 \% $\pm$ 0.00 & &  92.56\% $\pm$ 0.03 & & 90.67 \% $\pm$ 0.04 \\    
%     & \texttt{ResIST} & & 91.95\% $\pm$ 0.32 & & 92.35\% $\pm$ 0.22 & & 91.45\% $\pm$ 0.30\\
%     \midrule
%     C100 & Ensemble & & 72.08\% $\pm$ 0.05 & & 72.12\% $\pm$ 0.04 & & 67.98 \% $\pm$ 0.12 \\
%     & \texttt{ResIST} & & 70.06\% $\pm$ 0.51 & & 71.30\% $\pm$ 0.20 & & 70.26\% $\pm$ 0.21 \\
%  \bottomrule
% \end{tabular}
% \label{ensemble_perf}
% \end{scriptsize}
% \vspace{-0.3cm}
% \end{table}

% \begin{table*}[!t]
% \centering
% \caption{Test accuracy on CIFAR10 (C10) and CIFAR100 (C100) for deeper architectures trained with \texttt{ResIST} and local SGD (LSGD). All tests were performed with 100 local iterations between synchronization rounds. All models were trained for 80 epochs.}
% \vspace{0.1cm}
% \begin{small}
% \begin{tabular}{ccc|ccc|ccc}
% \toprule
%      &&& \multicolumn{3}{c|}{ResNet152} & \multicolumn{3}{c}{ResNet200}\\
%      Dataset & \# Machines & Method & Time & Test Acc. & Speedup & Time & Test Acc. & Speedup  \\
%      \midrule
%      C10 & 2 & LSGD & 3512s & 92.27\% $\pm$ 0.003 & & 4575s & 92.31\% $\pm$ 0.001 & \\
%      && \texttt{ResIST} & 2215s & 92.01\% $\pm$ 0.002 & \textbf{1.58$\times$} & 2380s & 92.10\% $\pm$ 0.001 & \textbf{1.92$\times$} \\
%      %\midrule
%      & 4 & LSGD & 3598s & 91.39\% $\pm$ 0.001 &  & 4357s & 91.35\% $\pm$ 0.000 & \\
%      && \texttt{ResIST} & 1054s & 90.67\% $\pm$ 0.001 & \textbf{3.41$\times$} &  1161s & 90.27\% $\pm$ 0.001 & \textbf{3.75$\times$} \\
%      \midrule
%       C100 & 2 & LSGD & 3528s & 70.50\% $\pm$ 0.003 & & 4639s & 71.05\% $\pm$ 0.005 & \\
%       && \texttt{ResIST} & 2291s & 70.32\% $\pm$ 0.005 & \textbf{1.53$\times$} & 2202s & 70.71\% $\pm$ 0.002 & \textbf{2.10$\times$} \\
%      % \midrule
%       & 4 & LSGD & 3518s & 68.39\% $\pm$ 0.004 & & 4391s & 69.05\% $\pm$ 0.003 & \\
%       && \texttt{ResIST} & 1164s & 67.27\% $\pm$ 0.003 & \textbf{3.02$\times$} & 1195s & 67.62\% $\pm$ 0.001 & \textbf{3.67$\times$} \\
%  \bottomrule
% \end{tabular}
% \end{small}
% \label{tab:deep_net_results}
% \vspace{-0.4cm}
% \end{table*}

% The performance of sub-ResNet ensembles in comparison to models trained with \texttt{ResIST} is displayed in Table \ref{ensemble_perf}.
% For 8 Sub-ResNets, the shallow ensembles achieve inferior performance in comparison to \texttt{ResIST}.
% When two and four Sub-ResNets are used, the performance of shallow ensembles and \texttt{ResIST} is comparable (i.e., $<1\%$ performance difference in most cases).
% However, it should be noted that such shallow ensembles of two or four sub-ResNets, in comparison to \texttt{ResIST}, cause a $2\times$ to $4\times$ slowdown in inference time (i.e., inference time for a single Sub-ResNet is not significantly faster than that of the global ResNet).
% Furthermore, the ensembles consume more parameters in comparison to global ResNet trained with \texttt{ResIST}. 

% \subsection{Robustness to Local Iterations}
% \label{S:local_iter}

% \begin{figure}[!htp]
%     \centering
%     %\hspace{-0.4cm} 
%     \includegraphics[width=0.9\linewidth]{images/c100_local_iter.pdf} \vspace{-0.4cm}
%     \caption{Test accuracy on CIFAR100 for ResNet-101 trained with both \texttt{ResIST} and local SGD (LSGD) with different numbers of local iterations. $\infty$ local iterations refers to aggregating parameters only once at the end of training (i.e., single-shot averaging). Shaded regions reflect deviations in accuracy.}
%     \label{fig:local_iter}
% %    \Description{Test accuracy on CIFAR100 for ResNet-101 trained with both \texttt{ResIST} and local SGD (LSGD) with different numbers of local iterations}
%     \vspace{-0.5cm}
% \end{figure}

% \texttt{ResIST} is robust to various numbers of local iterations \citep{use_local_sgd, parallel_sgd, fed_avg}.
% An extensive sweep over possible values of $\ell$ is performed on CIFAR100.
% The results of this experiment are depicted in Fig. \ref{fig:local_iter}.
% As can be seen, \texttt{ResIST} achieves high accuracy even with thousands of local SGD iterations (i.e., previous work typically uses much fewer \citep{use_local_sgd}).
% However, if more sub-ResNets are used, performance tends to deteriorate more quickly as local iterations increase.
% Due to the robustness of \texttt{ResIST} to large numbers of local iterations, training can be accelerated without deteriorating model performance by simply increasing the value of $\ell$.
% Local SGD was found to demonstrate similar robustness to the number of local iterations, as shown in Fig. \ref{fig:local_iter}.



% \subsection{Deeper architectures}
% \label{S:deep_arch}
% The \texttt{ResIST} methodology is easily applicable to deeper architectures.
% To demonstrate this, results are replicated for CIFAR10 and CIFAR100 datasets with ResNet152 and ResNet200.
% These deeper architectures are identical to the original ResNet101 architecture (i.e., see Fig. \ref{model_depict}).
% However, more residual blocks are added to the third section of the ResNet (i.e., the highlighted portion of Fig. \ref{model_depict}) to increase the model's depth.
% It should be noted that convolutional blocks within the third section of the ResNet are partitioned in \texttt{ResIST} by default (see Sec. \ref{subnet_sec}). 
% As a result, all extra residual blocks within these deeper architectures are partitioned to sub-ResNets by \texttt{ResIST} (i.e., no extra blocks are shared between sub-ResNets), allowing \texttt{ResIST} to achieve greater acceleration in comparison to local SGD. 


% The results of experiments with deeper ResNets are presented in Table \ref{tab:deep_net_results}. %\footnote{Due to limitations in computational resources, all models in this section were trained for 80 epochs.}
% \texttt{ResIST} performs competitively with local SGD in all cases.
% Furthermore, \texttt{ResIST} achieves a significant speedup in comparison to local SGD that becomes more pronounced as the model becomes deeper.
% E.g., for 4-GPUs, \texttt{ResIST} completes training $>3 \times$ faster than local SGD for ResNet200 on both datasets.
% This speedup is caused by a greater ratio of total network blocks being partitioned to sub-ResNets in \texttt{ResIST}.
% While local SGD must communicate all parameters between machines, \texttt{ResIST} achieves a relative decrease in communication by partitioning all extra residual blocks evenly between sub-ResNets.

% \subsection{\texttt{ResIST} and Quantization/Sparse Gradients}
% \label{S:quant}

% Many quantization \citep{commeff-sgd, double-quant} and sparsification \citep{sparse-comm, linear-speed-quant} techniques have been proposed for reducing communication costs in distributed training.
% Such techniques focus on compressing communicated data, and they do not interfere with our methodology, which provides a novel approach to model synchronization and training.
% The proposed approach can be easily combined with existing compression techniques to further reduce communication costs and accelerate training \emph{with no extra tuning or modifications}.
% To demonstrate that \texttt{ResIST} works well with quantization, we compress all communicated parameters using both four-bit and eight-bit compression.
% Table \ref{quantization} shows that \texttt{ResIST} retains its performance until the compression level reaches five-bit and lower.
% We also perform experiments with sparsification of communicated weights by only keeping 25\% of total weights within each synchronization round. 
% Such a strategy reaches a validation performance of 71.25\% on CIFAR100.
% We summarize the results of all quantization experiments in Fig. \ref{fig:budget}, where we compare communication budgets across different compression techniques with \texttt{ResIST}.
% From this figure, it is clear that \texttt{ResIST} is most efficient with six-bit quantization and is compatible with most main-stream compression techniques.

% % We also experiment with compressing all communicated weight with weight sparsification by only keeping top 25\% weights which reaches 71.25\% in CIFAR100.
% % We summarized all results in Figure \ref{fig:budget}, which compares the communication budget across different compression techniques and \texttt{ResIST} variants combined with these techniques.
% % It shows \texttt{ResIST}+6 bit quantization is most communication efficient and \texttt{ResIST} is compatible with both main types of compression techniques.

% \begin{table}[!htp]
% \vspace{-0.3cm}
% \centering
% \begin{small}
% \caption{Test Accuracy for \texttt{ResIST} combined with quantization on CIFAR10 and CIFAR100 (denoted as C10 and C100).}
% \vspace{0.1cm}
% \setlength{\tabcolsep}{.5\tabcolsep}
% \begin{tabular}{cccccc}
% \toprule
%     Dataset  & 8 bit & 7 bit & 6 bit & 5 bit & 4 bit\\ \midrule
%     C10   & 92.14\% & 92.26\% & 91.91\% & 91.35\%  & 76.33\%\\
%     C100  & 71.38\% & 72.15\% & 71.37\% & 68.29\% & 40.48\%\\
%   \bottomrule
%  \end{tabular}
%  \label{quantization}
% \end{small}
% \vspace{-0.3cm}
%  \end{table}
 
%  \begin{figure}
%  \vspace{-0.2cm}
%     \centering
%     \includegraphics[width=0.9\linewidth]{images/communication_budget_2.png} \vspace{-0.4cm}
%     \caption{Test accuracy vs. communication budget for \texttt{ResIST}, \texttt{ResIST}+quantization, \texttt{ResIST}+gradient compression, local SGD and vanilla data parallel on CIFAR100. All models are trained over a 4-GPU cluster.}
%     \label{fig:budget}
%     %\Description{Test accuracy vs. communication budget for \texttt{ResIST}, \texttt{ResIST}+quantization, \texttt{ResIST}+gradient compression, local SGD and vanilla data parallel on CIFAR100.}
%     \vspace{-0.6cm}
% \end{figure}
 

\section{Conclusion}

In this work, we present \texttt{ResIST}, a novel algorithm for synchronous, distributed training of ResNets.
\texttt{ResIST} operates by decomposing a global ResNet model into several shallower sub-ResNets, which are trained independently and intermittently aggregated into the global model.
 By only communicating parameters of sub-ResNets between machines and training shallower, less expensive networks, \texttt{ResIST} reduces the communication and local training cost of synchronous, distributed training.
 We demonstrate the impact of \texttt{ResIST} on several image classification datasets, as well as in the object detection domain, by highlighting the significant training acceleration it provides in comparison to methods like local SGD \citep{use_local_sgd} without any deterioration in performance. 

%\textcolor{magenta}{Maybe include some future work lines?}
In future work, we aim to extend \texttt{ResIST} to other network architectures, as \texttt{ResIST} is fully extensible to any architecture with residual connections.
Because residual connections are now standard in most important deep learning architectures (e.g., transformers), many opportunities to extend applications of \texttt{ResIST} exist.
Additionally, \texttt{ResIST} has been shown to be fully compatible with various gradient compression methods.
As such, we will investigate the prospect of fully integrating such compression methods within \texttt{ResIST}, both during training and communication phases, to further decrease memory and computation costs.
% As residu connection became standard in most important architectures, such as transformers \citep{transformer, bert, rezero}, and as \texttt{ResIST} is fully extendable to all residue models through layer wise partition, we will further experiments on applying \texttt{ResIST} to those architectures in the future work.
% On the other hand, as \texttt{ResIST} is shown to be fully compatible with various gradient compression method during communication, we will investigate in fully integrating these compression method with \texttt{ResIST} in both training and communication phase to further decrease the memory and computation cost. 
%%

%\clearpage
\bibliography{dun_187.bib}

% \appendix
% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi}
\onecolumn
%\input{appendix_fix.tex}
\end{document}
