%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.


%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{enumitem}
\usepackage{multirow}

\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{algorithm}
\usepackage{algorithmic}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Convenience shorthands: font wrappers plus upright (roman) names for
% operators, distributions and labels used in math mode.
% \vct and \rnd take no declared argument; they expand to a font command that
% then applies to whatever token or group follows them.
\newcommand{\vct}{\boldsymbol }
%\newcommand{\mat}{\mathbf}
\newcommand{\rnd}{\mathsf}
\newcommand{\ud}{\mathrm d}
\newcommand{\nml}{\mathcal{N}}
\newcommand{\loss}{\mathcal{L}}
\newcommand{\hinge}{\mathcal{R}}
\newcommand{\kl}{\mathrm{KL}}
\newcommand{\cov}{\mathrm{cov}}
\newcommand{\dir}{\mathrm{Dir}}
\newcommand{\mult}{\mathrm{Mult}}
\newcommand{\err}{\mathrm{err}}
\newcommand{\sgn}{\mathrm{sgn}}
%\renewcommand{\span}{\mathrm{span}}
\newcommand{\argmin}{\mathrm{argmin}}
\newcommand{\argmax}{\mathrm{argmax}}
\newcommand{\poly}{\mathrm{poly}}
\newcommand{\rank}{\mathrm{rank}}
\newcommand{\conv}{\mathrm{conv}}
\newcommand{\E}{\mathbb{E}}
% Upright like every other operator name in this list. It previously expanded
% to \mat{diag}, i.e. bold text produced through the matrix macro.
\newcommand{\diag}{\mathrm{diag}}
\newcommand{\acc}{\mathrm{acc}}

% More upright (roman) operator names and labels for use in math mode.
\newcommand{\aff}{\mathrm{aff}}
\newcommand{\range}{\mathrm{Range}}
\newcommand{\Sgn}{\mathrm{sign}}

\newcommand{\hit}{\mathrm{hit}}
\newcommand{\cross}{\mathrm{cross}}
\newcommand{\Left}{\mathrm{left}}
\newcommand{\Right}{\mathrm{right}}
\newcommand{\Mid}{\mathrm{mid}}
\newcommand{\bern}{\mathrm{Bernoulli}}
\newcommand{\ols}{\mathrm{ols}}
\newcommand{\tr}{\mathrm{tr}}
\newcommand{\opt}{\mathrm{opt}}
\newcommand{\ridge}{\mathrm{ridge}}
\newcommand{\unif}{\mathrm{unif}}
\newcommand{\Image}{\mathrm{im}}
\newcommand{\Kernel}{\mathrm{ker}}
\newcommand{\supp}{\mathrm{supp}}
\newcommand{\pred}{\mathrm{pred}}
% An equals sign with a bold P stacked above it.
\newcommand{\distequal}{\stackrel{\mathbf{P}}{=}}
%\newcommand{\gege}{\textcircled{1}}
% A(.,.) evaluated at the four sign combinations of w and w_*.
% These (and \inputvar below) use \vect, which is defined further down in the
% preamble; this works because a macro body is only expanded where it is used.
\newcommand{\gege}{{A(\vect{w},\vect{w}_*)}}
\newcommand{\gele}{{A(\vect{w},-\vect{w}_*)}}
\newcommand{\lele}{{A(-\vect{w},-\vect{w}_*)}}
\newcommand{\lege}{{A(-\vect{w},\vect{w}_*)}}
% Symbols for network components.
\newcommand{\firstlayer}{\mathbf{W}}
\newcommand{\firstlayerWN}{v}
\newcommand{\secondlayer}{a}
\newcommand{\inputvar}{\vect{x}}
\newcommand{\anglemat}{\mathbf{\Phi}}
% The space before the closing brace is part of the expansion, so \holder
% always emits a trailing space after the word.
\newcommand{\holder}{H\"{o}lder }

% Short symbol aliases. These use \def, which (unlike \newcommand) replaces any
% existing definition of the same name without raising an error.

% Blackboard-bold number sets.
\def\R{\mathbb{R}}
\def\Z{\mathbb{Z}}
% Calligraphic letters: \cX expands to \mathcal{X}.
\def\cA{\mathcal{A}}
\def\cB{\mathcal{B}}
\def\cD{\mathcal{D}}
\def\cE{\mathcal{E}}
\def\cF{\mathcal{F}}
\def\cG{\mathcal{G}}
\def\cH{\mathcal{H}}
\def\cI{\mathcal{I}}
\def\cL{\mathcal{L}}
\def\cM{\mathcal{M}}
\def\cN{\mathcal{N}}
\def\cP{\mathcal{P}}
\def\cS{\mathcal{S}}
\def\cT{\mathcal{T}}
\def\cW{\mathcal{W}}
\def\cZ{\mathcal{Z}}
% Bold P, and upright abbreviations.
\def\bP{\mathbf{P}}
\def\TV{\mathrm{TV}}
\def\MSE{\mathrm{MSE}}

% Bold letters: \vX expands to \mathbf{X}.
\def\vw{\mathbf{w}}
\def\va{\mathbf{a}}
\def\vZ{\mathbf{Z}}

% Matrix and vector wrappers; both expand to \mathbf of their argument.
\newcommand{\mat}[1]{\mathbf{#1}}
\newcommand{\vect}[1]{\mathbf{#1}}
% Auto-sized delimiters around the argument (norm bars, angle brackets,
% absolute-value bars).
\newcommand{\norm}[1]{\left\|#1\right\|}
\newcommand{\inner}[1]{\left\langle#1\right\rangle}
\newcommand{\abs}[1]{\left|#1\right|}
% \expect has the same expansion as \E defined earlier in the preamble.
\newcommand{\expect}{\mathbb{E}}
\newcommand{\prob}{\mathbb{P}}
% Bold "Prox" with subscript #1, applied to #2 inside auto-sized braces.
\newcommand{\prox}[2]{\textbf{Prox}_{#1}\left\{#2\right\}}
% NOTE(review): \mathscr is not provided by any package loaded explicitly in
% this preamble (it usually comes from mathrsfs or a font package); confirm
% the class supplies it before using \event.
\newcommand{\event}[1]{\mathscr{#1}}
% Identity wrapper: typesets its argument unchanged.
\newcommand{\set}[1]{#1}
\newcommand{\diff}{\text{d}}
\newcommand{\difference}{\triangle}
\newcommand{\inputdist}{\mathcal{Z}}
\newcommand{\indict}{\mathbb{I}}
\newcommand{\rotmat}{\mathbf{R}}
\newcommand{\normalize}[1]{\overline{#1}}
\newcommand{\vectorize}[1]{\text{vec}\left(#1\right)}
% Symbols for function/policy classes, thresholds and spaces.
% \vclass and \gclass have the same expansion (\mathcal{G}).
\newcommand{\vclass}{\mathcal{G}}
\newcommand{\pclass}{\Pi}
\newcommand{\qclass}{\mathcal{Q}}
\newcommand{\rclass}{\mathcal{R}}
\newcommand{\classComplexity}[2]{N_{class}(#1,#2)}
\newcommand{\cclass}{\mathcal{F}}
\newcommand{\gclass}{\mathcal{G}}
\newcommand{\pthres}{p_{thres}}
\newcommand{\ethres}{\epsilon_{thres}}
\newcommand{\eclass}{\epsilon_{class}}
\newcommand{\states}{\mathcal{S}}
\newcommand{\lowprobstate}{\psi}
\newcommand{\actions}{\mathcal{A}}
\newcommand{\contexts}{\mathcal{X}}
\newcommand{\edges}{\mathcal{E}}
\newcommand{\variance}{\text{Var}}
\newcommand{\params}{\vect{\theta}}
% \sign uses \text, whereas \Sgn (defined earlier) uses \mathrm for the same word.
\newcommand{\sign}{\text{sign}}

% Activation function applied to an argument, and its derivative.
% \relu and \act have identical expansions: sigma(#1) with auto-sized parentheses.
\newcommand{\relu}[1]{\sigma\left(#1\right)}
\newcommand{\reluder}[1]{\sigma'\left(#1\right)}
\newcommand{\act}[1]{\sigma\left(#1\right)}
\newcommand{\kijmin}{\lambda}
% Least eigenvalue of the matrix K^{(H)}.
\newcommand{\lambdamin}{\lambda_{\min}\left(\mat{K}^{(H)}\right)}

% Theorem-like environments. Each one is declared with the trailing [section]
% argument and no shared-counter argument, so every environment has its own
% counter that is numbered within (and reset at) each section.
\newtheorem{thm}{Theorem}[section]
\newtheorem{lem}{Lemma}[section]
% A custom proof environment is left disabled; amsthm (loaded above) already
% provides \begin{proof}...\end{proof}.
% \newtheorem{proof}{Proof}[section]
\newtheorem{cor}{Corollary}[section]
\newtheorem{prop}{Proposition}[section]
\newtheorem{asmp}{Assumption}[section]
\newtheorem{defn}{Definition}[section]
\newtheorem{fact}{Fact}[section]
\newtheorem{conj}{Conjecture}[section]
\newtheorem{rem}{Remark}[section]
\newtheorem{example}{Example}[section]
\newtheorem{condition}{Condition}[section]

%Xiyu framework's notations
% Symbol aliases for the framework notation named in the comment above.
\newcommand{\gaussian}{\mathcal{P}}
\newcommand{\linfunc}{\mathcal{L}}
\newcommand{\linsub}{\mathcal{W}}
\newcommand{\detmap}{\mathcal{D}}
\newcommand{\activate}{\rho}
\newcommand{\bias}{b}
\newcommand{\error}{\mathcal{E}}
\newcommand{\wbound}{\mathfrak{W}}
\newcommand{\rhobound}{\Lambda}
\newcommand{\gaussianspace}{{\mathcal{L}^2}}
% Print section numbers as lower-case letters (a, b, ...) instead of arabic
% numerals. \def is used so the existing \thesection is overwritten.
\def\thesection{\alph{section}}
\title{ResIST: Layer-Wise Decomposition of ResNets for Distributed Training (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Chen Dun}
\author[1]{Cameron R. Wolfe}
\author[1]{Christopher M. Jermaine}
\author[1]{Anastasios Kyrillidis}

% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Rice University\\
    Houston, Texas, USA
}
\allowdisplaybreaks

\begin{document}
\onecolumn
\maketitle
\section{Ablations}
%We outline numerous ablation experiments that were performed using \texttt{ResIST}.
The ablation experiments in this section provide an understanding of the behavior of \texttt{ResIST}, as well as empirical support for its design.

\subsection{Designing \texttt{ResIST}} \label{design_ablation}
Extensive ablation experiments are conducted on the CIFAR10 dataset, outlined in Fig. \ref{resist_ablations}, to empirically motivate the design choices made within \texttt{ResIST} (i.e., see Sec. \ref{S:supp_tech}).
For the two sub-ResNet case, the naive implementation of \texttt{ResIST}, which evenly splits all convolutional blocks between subnetworks, is shown to perform poorly (i.e., $<$70\% on CIFAR10).
The accuracy of \texttt{ResIST} is improved by over 25\% by only allowing select layers to be partitioned and ensuring activations are scaled correctly when performing inference with the full network.
The pre-activation ResNet is shown to yield an improvement in accuracy, leading \texttt{ResIST} to perform near optimally with two sub-ResNets.

\begin{figure}[!htp]
\centering
\includegraphics[width=5in]{images/resist_ablations.png} 
\caption{Test accuracies on the CIFAR10 dataset for a single run for the major ablation experiments performed with \texttt{ResIST}.}
\label{resist_ablations}

%\vspace{-0.2cm}
\end{figure}

When \texttt{ResIST} is expanded to eight sub-ResNets, we initially observe a significant decrease in model accuracy.
However, as can be seen in Fig. \ref{resist_ablations}, this gap can be closed by enforcing a minimum depth on sub-ResNets and tuning the number of local iterations.
By making these extra modifications, \texttt{ResIST} begins to perform similarly with two to eight sub-ResNets, yielding compelling performance.% in different experimental settings.

\subsection{Shallow Ensembles}\label{shallow_ensembles}

% \begin{table}[]
% \centering
% \begin{scriptsize}
% \caption{Performance of indpendently-trained ensembles of shallow ResNets on CIFAR10 and CIFAR100 (denoted as C10 and C100, respectively.}
%  \vspace{0.1cm}
% \begin{tabular}{cccccc}
% \toprule

%    Dataset & Method & 2 Model & 4 Model & 8 Model \\ \midrule
%    C10 & Ensemble & 92.27 % & 92.53 & 90.67 \% \\
%    & \texttt{ResIST} & 91.95\% $\pm$ 0.32 & 92.35\% $\pm$ 0.22 & 91.29 $\pm$ 0.12\\
%    \midrule
%    C100 & Ensemble & 72.08\% $\pm$ 0.05 & 72.12\% & 68.10\% \\
%    & \texttt{ResIST} & 70.06\% $\pm$ 0.51 & 71.30\% $\pm$ 0.20 & 70.18\% $\pm$ 0.21 \\


%  \bottomrule
% \end{tabular}
% \label{ensemble_perf}
% \end{scriptsize}
% \end{table}


The \texttt{ResIST} algorithm requires that independently-trained sub-ResNets must have their parameters synchronized intermittently. 
Such synchronization, however, can be completely avoided by training each sub-ResNet separately and forming an ensemble (i.e., \texttt{ResIST} without any aggregation).
Although maintaining an ensemble has several drawbacks (e.g., slower inference, more parameters, etc.), the training time of the ensemble would nonetheless be reduced in comparison to \texttt{ResIST} by avoiding communication altogether. 
Therefore, the performance of such an ensemble should be compared to the models trained with \texttt{ResIST}.

\begin{table}[!ht]
%\vspace{-0.4cm}
\centering
\caption{Performance of independently trained ensembles of shallow ResNets in comparison to \texttt{ResIST} on CIFAR10 and CIFAR100 (denoted as C10 and C100, respectively).}
 \vspace{0.2cm}
 \setlength{\tabcolsep}{.2\tabcolsep}
\begin{tabular}{ccccccccc}
\toprule
%    Dataset & Method & 2 Model & 4 Model & 8 Model \\ \midrule
%    C10 & Ensemble & \textbf{92.27\% $\pm$ 0.00} & 91.19\% $\pm$ 0.01 & 88.04\% $\pm$ 0.02 \\
%    & \texttt{ResIST} & 91.95\% $\pm$ 0.32 & \textbf{91.78\% $\pm$ 0.82} & \textbf{91.29\% $\pm$ 0.12}\\
%    \midrule
%    C100 & Ensemble & \textbf{72.08\% $\pm$ 0.05} & 69.21\% $\pm$ 0.08 & 60.90\% $\pm$ 0.12\\
%    & \texttt{ResIST} & 70.06\% $\pm$ 0.51 & \textbf{71.28\% $\pm$ 0.24}  & \textbf{70.18\% $\pm$ 0.21} \\
    Dataset & Method & & 2 Model & &  4 Model & & 8 Model \\ \midrule
    C10 & Ensemble & & 92.27 \% $\pm$ 0.00 & &  92.56\% $\pm$ 0.03 & & 90.67 \% $\pm$ 0.04 \\    
    & \texttt{ResIST} & & 91.95\% $\pm$ 0.32 & & 92.35\% $\pm$ 0.22 & & 91.45\% $\pm$ 0.30\\
    \midrule
    C100 & Ensemble & & 72.08\% $\pm$ 0.05 & & 72.12\% $\pm$ 0.04 & & 67.98 \% $\pm$ 0.12 \\
    & \texttt{ResIST} & & 70.06\% $\pm$ 0.51 & & 71.30\% $\pm$ 0.20 & & 70.26\% $\pm$ 0.21 \\
 \bottomrule
\end{tabular}
\label{ensemble_perf}
\end{table}

\begin{table*}
\centering
\caption{Test accuracy on CIFAR10 (C10) and CIFAR100 (C100) for deeper architectures trained with \texttt{ResIST} and local SGD (LSGD). All tests were performed with 100 local iterations between synchronization rounds. All models were trained for 80 epochs.}
\vspace{0.1cm}
\begin{small}
\begin{tabular}{ccc|ccc|ccc}
\toprule
     &&& \multicolumn{3}{c|}{ResNet152} & \multicolumn{3}{c}{ResNet200}\\
     Dataset & \# Machines & Method & Time & Test Acc. & Speedup & Time & Test Acc. & Speedup  \\
     \midrule
     C10 & 2 & LSGD & 3512s & 92.27\% $\pm$ 0.003 & & 4575s & 92.31\% $\pm$ 0.001 & \\
     && \texttt{ResIST} & 2215s & 92.01\% $\pm$ 0.002 & \textbf{1.58$\times$} & 2380s & 92.10\% $\pm$ 0.001 & \textbf{1.92$\times$} \\
     %\midrule
     & 4 & LSGD & 3598s & 91.39\% $\pm$ 0.001 &  & 4357s & 91.35\% $\pm$ 0.000 & \\
     && \texttt{ResIST} & 1054s & 90.67\% $\pm$ 0.001 & \textbf{3.41$\times$} &  1161s & 90.27\% $\pm$ 0.001 & \textbf{3.75$\times$} \\
     \midrule
      C100 & 2 & LSGD & 3528s & 70.50\% $\pm$ 0.003 & & 4639s & 71.05\% $\pm$ 0.005 & \\
      && \texttt{ResIST} & 2291s & 70.32\% $\pm$ 0.005 & \textbf{1.53$\times$} & 2202s & 70.71\% $\pm$ 0.002 & \textbf{2.10$\times$} \\
     % \midrule
      & 4 & LSGD & 3518s & 68.39\% $\pm$ 0.004 & & 4391s & 69.05\% $\pm$ 0.003 & \\
      && \texttt{ResIST} & 1164s & 67.27\% $\pm$ 0.003 & \textbf{3.02$\times$} & 1195s & 67.62\% $\pm$ 0.001 & \textbf{3.67$\times$} \\
 \bottomrule
\end{tabular}
\end{small}
\label{tab:deep_net_results}
\end{table*}

The performance of sub-ResNet ensembles in comparison to models trained with \texttt{ResIST} is displayed in Table \ref{ensemble_perf}.
For 8 sub-ResNets, the shallow ensembles achieve inferior performance in comparison to \texttt{ResIST}.
When two and four sub-ResNets are used, the performance of shallow ensembles and \texttt{ResIST} is comparable (i.e., $<1\%$ performance difference in most cases).
However, it should be noted that such shallow ensembles of two or four sub-ResNets, in comparison to \texttt{ResIST}, cause a $2\times$ to $4\times$ slowdown in inference time (i.e., inference time for a single sub-ResNet is not significantly faster than that of the global ResNet).
Furthermore, the ensembles consume more parameters in comparison to the global ResNet trained with \texttt{ResIST}.

\subsection{Robustness to Local Iterations}
\label{S:local_iter}

\begin{figure}[!htp]
    \centering
    %\hspace{-0.4cm} 
    \includegraphics[width=0.7\linewidth]{images/c100_local_iter.pdf} \vspace{-0.4cm}
    \caption{Test accuracy on CIFAR100 for ResNet-101 trained with both \texttt{ResIST} and local SGD (LSGD) with different numbers of local iterations. $\infty$ local iterations refers to aggregating parameters only once at the end of training (i.e., single-shot averaging). Shaded regions reflect deviations in accuracy.}
    \label{fig:local_iter}
%    \Description{Test accuracy on CIFAR100 for ResNet-101 trained with both \texttt{ResIST} and local SGD (LSGD) with different numbers of local iterations}
    \vspace{-0.5cm}
\end{figure}

\texttt{ResIST} is robust to various numbers of local iterations \citep{use_local_sgd, parallel_sgd, fed_avg}.
An extensive sweep over possible values of $\ell$ is performed on CIFAR100.
The results of this experiment are depicted in Fig. \ref{fig:local_iter}.
As can be seen, \texttt{ResIST} achieves high accuracy even with thousands of local SGD iterations (i.e., previous work typically uses much fewer \citep{use_local_sgd}).
However, if more sub-ResNets are used, performance tends to deteriorate more quickly as local iterations increase.
Due to the robustness of \texttt{ResIST} to large numbers of local iterations, training can be accelerated without deteriorating model performance by simply increasing the value of $\ell$.
Local SGD was found to demonstrate similar robustness to the number of local iterations, as shown in Fig. \ref{fig:local_iter}.



\subsection{Deeper architectures}
\label{S:deep_arch}
The \texttt{ResIST} methodology is easily applicable to deeper architectures.
To demonstrate this, results are replicated for CIFAR10 and CIFAR100 datasets with ResNet152 and ResNet200.
These deeper architectures are identical to the original ResNet101 architecture (i.e., see Fig. \ref{model_depict}).
However, more residual blocks are added to the third section of the ResNet (i.e., the highlighted portion of Fig. \ref{model_depict}) to increase the model's depth.
It should be noted that convolutional blocks within the third section of the ResNet are partitioned in \texttt{ResIST} by default (see Sec. \ref{subnet_sec}). 
As a result, all extra residual blocks within these deeper architectures are partitioned to sub-ResNets by \texttt{ResIST} (i.e., no extra blocks are shared between sub-ResNets), allowing \texttt{ResIST} to achieve greater acceleration in comparison to local SGD. 


The results of experiments with deeper ResNets are presented in Table \ref{tab:deep_net_results}. %\footnote{Due to limitations in computational resources, all models in this section were trained for 80 epochs.}
\texttt{ResIST} performs competitively with local SGD in all cases.
Furthermore, \texttt{ResIST} achieves a significant speedup in comparison to local SGD that becomes more pronounced as the model becomes deeper.
E.g., for 4-GPUs, \texttt{ResIST} completes training $>3 \times$ faster than local SGD for ResNet200 on both datasets.
This speedup is caused by a greater ratio of total network blocks being partitioned to sub-ResNets in \texttt{ResIST}.
While local SGD must communicate all parameters between machines, \texttt{ResIST} achieves a relative decrease in communication by partitioning all extra residual blocks evenly between sub-ResNets.

\subsection{\texttt{ResIST} and Quantization/Sparse Gradients}
\label{S:quant}

Many quantization \citep{commeff-sgd, double-quant} and sparsification \citep{sparse-comm, linear-speed-quant} techniques have been proposed for reducing communication costs in distributed training.
Such techniques focus on compressing communicated data, and they do not interfere with our methodology, which provides a novel approach to model synchronization and training.
The proposed approach can be easily combined with existing compression techniques to further reduce communication costs and accelerate training \emph{with no extra tuning or modifications}.
To demonstrate that \texttt{ResIST} works well with quantization, we compress all communicated parameters at quantization levels ranging from eight-bit down to four-bit.
Table~\ref{quantization} shows that \texttt{ResIST} retains its performance until the compression level reaches five-bit and lower.
We also perform experiments with sparsification of communicated weights by only keeping 25\% of total weights within each synchronization round.
Such a strategy reaches a validation performance of 71.25\% on CIFAR100.
We summarize the results of all quantization experiments in Fig.~\ref{fig:budget}, where we compare communication budgets across different compression techniques with \texttt{ResIST}.
From this figure, it is clear that \texttt{ResIST} is most efficient with six-bit quantization and is compatible with most mainstream compression techniques.

% We also experiment with compressing all communicated weight with weight sparsification by only keeping top 25\% weights which reaches 71.25\% in CIFAR100.
% We summarized all results in Figure \ref{fig:budget}, which compares the communication budget across different compression techniques and \texttt{ResIST} variants combined with these techniques.
% It shows \texttt{ResIST}+6 bit quantization is most communication efficient and \texttt{ResIST} is compatible with both main types of compression techniques.

\begin{table}[!htp]
\centering
\caption{Test Accuracy for \texttt{ResIST} combined with quantization on CIFAR10 and CIFAR100 (denoted as C10 and C100).}
\vspace{0.1cm}
\setlength{\tabcolsep}{.5\tabcolsep}
\begin{tabular}{cccccc}
\toprule
    Dataset  & 8 bit & 7 bit & 6 bit & 5 bit & 4 bit\\ \midrule
    C10   & 92.14\% & 92.26\% & 91.91\% & 91.35\%  & 76.33\%\\
    C100  & 71.38\% & 72.15\% & 71.37\% & 68.29\% & 40.48\%\\
  \bottomrule
 \end{tabular}
 \label{quantization}
 \end{table}
 
 \begin{figure}
    \centering
    \includegraphics[width=0.7\linewidth]{images/communication_budget_final.png}
    \caption{Test accuracy vs. communication budget for \texttt{ResIST}, \texttt{ResIST}+quantization, \texttt{ResIST}+gradient compression, local SGD and vanilla data parallel on CIFAR100. All models are trained over a 4-GPU cluster.}
    \label{fig:budget}
    %\Description{Test accuracy vs. communication budget for \texttt{ResIST}, \texttt{ResIST}+quantization, \texttt{ResIST}+gradient compression, local SGD and vanilla data parallel on CIFAR100.}
    
\end{figure}

\subsection{Further Analysis on Communication Cost Reduction of \texttt{ResIST}}
The significant communication cost reduction of \texttt{ResIST}, in comparison with local SGD, comes from the fact that (1) it reduces the communication volume at each global synchronization by only communicating subnetworks, and (2) it has a similar convergence speed in terms of the number of global synchronization rounds, as shown in Fig.~\ref{fig:global_syncrhonization_rounds}. This is also true when \texttt{ResIST} is combined with other compression techniques.

\begin{figure}
    \centering
    \includegraphics[width=0.7\linewidth]{images/global_synchronization_rounds.png}
    \caption{Test accuracy vs. global synchronization rounds for \texttt{ResIST}, \texttt{ResIST}+quantization, \texttt{ResIST}+gradient compression, local SGD and vanilla data parallel on CIFAR100. All models are trained over a 4-GPU cluster.}
    \label{fig:global_syncrhonization_rounds}
    %\Description{Test accuracy vs. communication budget for \texttt{ResIST}, \texttt{ResIST}+quantization, \texttt{ResIST}+gradient compression, local SGD and vanilla data parallel on CIFAR100.}
    
\end{figure}

\subsection{Comparison between \texttt{ResIST} and Federated Dropout}
Federated Dropout, a concurrent work with our paper, also explores splitting a global model into smaller ones in order to achieve acceleration and preserve final accuracy. Yet, we have not found any work in the literature that handles ResNets and residual blocks specifically or that utilizes additional techniques similar to those we have applied (such as layer scaling in subnetworks, as shown in Figure~\ref{resist_ablations}). More importantly, Federated Dropout and its variants do not utilize multiple rounds of local subnetwork training before global synchronization. In other words, Federated Dropout variants need to communicate and synchronize at every training iteration. On the other hand, \texttt{ResIST} and its baseline, local SGD, train locally for a number of iterations (e.g., 50) on their local models before each synchronization. Thus, compared with Federated Dropout, \texttt{ResIST} significantly reduces the communication frequency and, accordingly, the total communication volume/cost, as we show in the experiments. Further, Federated Dropout and its variants can only support around a 25\% dropout rate for each subnetwork, whereas \texttt{ResIST}, with our additional techniques, can support above a 50\% dropout rate for each subnetwork. This further reduces the total communication cost, local computation cost, and local memory cost. In CIFAR100 experiments with 4 workers, to reach a test accuracy of 71\%, the total communication cost/volume for \texttt{ResIST} is 112.32 GB, while for Federated Dropout the cost is 8138.81 GB. In other words, \texttt{ResIST} achieves a $72.46\times$ reduction in communication cost.

\section{Proof for \texttt{ResIST}}{\label{sec:proof}}
Suppose we have $S$ workers, for subnetwork $v$ at local training step $l_t \le \ell$ and global synchronization step $t \le T$:

\begin{align*}
\vect{x}^{(1)}_{v,l_t,t}&=\sqrt{\frac{c_{\sigma}}{m}} \relu{\mat{W}^{(1)}_{v,l_t,t}\vect{x_{v,l_t,t}}}, \nonumber\\
\vect{x}^{(h)}_{v,l_t,t} & =\vect{x}^{(h-1)}_{v,l_t,t}+\frac{c_{res}}{H\sqrt{m}} \relu{\mat{W}^{(h)}_{v,l_t,t}\vect{x}^{(h-1)}_{v,l_t,t}}M^{(h)}_{v,t} \nonumber\\
& ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\text{ for } 2\le h\le H, \nonumber\\
f_{res}(\vect{x},\params)&=\vect{a}_{v,l_t,t}^\top \vect{x}^{(H)}_{v,l_t,t}
\end{align*}
where  $0< c_{res} < 1$ is a small constant and $M^{(h)}_{v,t}$ is a random binary variable in layer dropout, or the indicator in \texttt{ResIST} that indicates whether this layer is partitioned to this subnetwork. This mask variable is constant during local training steps and is re-sampled/re-assigned at each global synchronization step. In \texttt{ResIST} and other research on layer dropout for ResNets, the last layer is never dropped/partitioned but is shared with all workers. Thus, in the following proof, we will follow this setting. Note that here we use a $\frac{c_{res}}{H\sqrt{m}}$ scaling.
We follow the general assumption made in \cite{du2019gradient} on some technical conditions on the activation functions $\sigma$:
There exists a constant $c>0$ such that	$\abs{\relu{0}} \le c$ and for any $z,z' \in \mathbb{R}$,
\begin{align*}
\abs{\relu{z} -\relu{z'} }\le &c\abs{z-z'}, \\
\text{ and }\abs{\sigma'(z)-\sigma'(z')} \le &c \abs{z-z'}.
\end{align*}
Also, we assume $\relu{\cdot}$ is analytic and is not a polynomial function.


In practice, several activation functions satisfy the above two assumptions. The guiding example is softplus: $\relu{z} = \log(1+\exp(z))$. For softplus, both the Lipschitz constant and the smoothness constant are $1$.
In this paper, we view all activation-function-related parameters as constants.

The gradient for subnetwork is 
\begin{align*}
\frac{\partial L}{\partial \mat{W}^{(h)}_{v,l_t,t}} =& \frac{c_{res}}{H\sqrt{m}}
\sum_{i=1}^{n}(y_i-u_i)\vect{x}_{i,v,l_t,t}^{(h-1)} \cdot
\left[\vect{a}_{v,l_t,t}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(l)}\mat{W}_{v,l_t,t}^{(l)} M^{(l)}_{v,t} \right) \mat{J}_{i,v,l_t,t}^{(h)}  M^{(h)}_{v,t}\right]
\end{align*}
For subnetwork, $\mat{G}^{(H)}$ has the same form as in layer drop ResNet.

The accumulated gradients of all the subnetworks:
\begin{align*}
\mathcal{W}^{(h)}_{t+1} - \mathcal{W}^{(h)}_{t} &= 
\eta \frac{\sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \frac{\partial L}{\partial \mat{W}^{(h)}_{v,l_t,t}}}{\sum_{v=1}^{S}M^{(h)}_{v,t}} \\
&=\frac{\eta}{\sum_{v=1}^{S}M^{(h)}_{v,t}} \sum_{v=1}^{S} \sum_{l_t=1}^{\ell}\frac{c_{res}}{H\sqrt{m}}
\sum_{i=1}^{n}(y_i-u_i)\vect{x}_{i,v,l_t,t}^{(h-1)} \cdot
\left[\vect{a}_{v,l_t,t}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(l)}\mat{W}_{v,l_t,t}^{(l)} M^{(l)}_{v,t} \right) \mat{J}_{i,v,l_t,t}^{(h)}  M^{(h)}_{v,t}\right]
\end{align*}

 
The whole network at global synchronization step $t+1$:

\begin{align*}
\vect{x}^{(1)}_{t}&=\sqrt{\frac{c_{\sigma}}{m}} \relu{\frac{\sum_{v=1}^{S} \mat{W}^{(1)}_{v,\ell,t}}{S}\vect{x_{t}}}, \nonumber\\
\vect{x}^{(h)}_{t} & =\vect{x}^{(h-1)}_{t}+\frac{c_{res}}{H\sqrt{m}} \relu{\frac{\sum_{v=1}^{S} \mat{W}^{(h)}_{v,\ell,t}M^{(h)}_{v,t}}{\sum_{v=1}^{S}M^{(h)}_{v,t}}\vect{x}^{(h-1)}_{t}} \nonumber\\
& ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\text{ for } 2\le h\le H, \nonumber\\
f_{res}(\vect{x},\params)&=\frac{\sum_{v=1}^{S} \vect{a}_{v,\ell,t}}{S}^\top \vect{x}^{(H)}_{t}
\end{align*}
Let $\mathcal{W}^{(h)}_t=\frac{\sum_{v=1}^{S} \mat{W}^{(h)}_{v,\ell,t}M^{(h)}_{v,t}}{\sum_{v=1}^{S}M^{(h)}_{v,t}}$, $\vect{a}_t=\frac{\sum_{v=1}^{S} \vect{a}_{v,\ell,t}}{S}$

The whole network at global synchronization step $0$:

\begin{align*}
\vect{x}^{(1)}_{0}&=\sqrt{\frac{c_{\sigma}}{m}} \relu{\mat{W}^{(1)}_{0}\vect{x}}, \nonumber\\
\vect{x}^{(h)}_{0} & =\vect{x}^{(h-1)}_{0}+\frac{c_{res}}{H\sqrt{m}} \relu{\mat{W}^{(h)}_{0}\vect{x}^{(h-1)}_{0}} \nonumber\\
& ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\text{ for } 2\le h\le H, \nonumber\\
f_{res}(\vect{x},\params)&=\vect{a}_0^\top \vect{x}^{(H)}_{0}
\end{align*}

\subsection{Proof Sketch}
We can write the loss of the whole network at global synchronization step $t+1$ as \[
L(\params(t),\mat{M}_{1,t},\mat{M}_{2,t},\dots,\mat{M}_{S,t}) = \frac{1}{2}\norm{\vect{y}-\vect{u}(t, \mat{M}_{1,t},\mat{M}_{2,t},\dots,\mat{M}_{S,t})}_2^2,
\]
where $\mat{M}_{v,t}=\{ M^{(1)}_{v,t},M^{(2)}_{v,t},\dots,M^{(H)}_{v,t} \}$.
Let $\mathcal{M}_t=\{ \mat{M}_{1,t},\mat{M}_{2,t},\dots,\mat{M}_{S,t} \}$.

For convenience, we drop all mask notation in the following proof. Let $\hat{\vect{u}}(t)$ be the output of the whole network at global synchronization step $t+1$.
Now recall the progress of loss function:
\begin{align*}
\norm{\vect{y}-\vect{\hat{u}}(t+1)}_2^2  
= & \norm{\vect{y}-\vect{\hat{u}}(t)}_2^2 -2 \left(\vect{y}-\vect{\hat{u}}(t)\right)^\top \left(\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)\right) + \norm{\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)}_2^2
\end{align*}
Following \cite{du2019gradient}, we apply Taylor expansion on $\left(\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)\right)$ and look at the $i$th coordinate.
\begin{align*}
\hat{u}_{i}(t+1)-\hat{u}_{i}(t) & = -\langle \theta(t+1)-\theta(t), \hat{u}'_i\left(\params(t)\right) \rangle + \int_{s=0}^{1} \langle \theta(t+1)-\theta(t), \hat{u}'_i\left(\params(t)\right) -\hat{u}'_i\left(\params(t)-s (\params(t)-\params(t+1)) \right) \rangle ds \\
& \triangleq I^i_1(t)+I_2^i(t)
\end{align*}
However, it is not obvious that $I_1(t)$ and $I_2(t)$ can be directly bounded to show the decrease of the loss of the whole network, as both of them involve the accumulated gradient change from distributed local subnetwork training. Thus, we introduce a new term $I'^i_1(t)$ as below, which relates to the hypothetical global gradient direction as if the whole network were trained centrally.
\begin{align*}
	I'^i_1(t)=&-\eta \ell \langle L'(\params(t)), \hat{u}'_i\left(\params(t)\right) \rangle \\
	=&-\eta \ell \sum_{j=1}^{n}(\hat{u}_j-y_j) \langle \hat{u}'_j(\params(t)), \hat{u}'_i\left(\params(t)\right) \rangle\\
	\triangleq& -\eta \ell \sum_{j=1}^{n}(\hat{u}_j-y_j) \sum_{h=1}^{H+1}\mat{\hat{G}}^{(h)}_{ij}(t)
	\end{align*}
Accordingly, 
\begin{align*}
&\norm{\vect{y}-\vect{\hat{u}}(t+1)}_2^2  \\
= & \norm{\vect{y}-\vect{\hat{u}}(t)}_2^2 -2 \left(\vect{y}-\vect{\hat{u}}(t)\right)^\top \left( \vect{I}_1(t)+\vect{I}_2(t)+\vect{I'}_1(t)-\vect{I'}_1(t) \right) + \norm{\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)}_2^2\\
= &\norm{\vect{y}-\vect{\hat{u}}(t)}_2^2 - 2 \left(\vect{y}-\vect{\hat{u}}(t)\right)^\top\vect{I'}_1(t) +2 \left(\vect{y}-\vect{\hat{u}}(t)\right)^\top(\vect{I'}_1(t)-\vect{I}_1(t))-2\left(\vect{y}-\vect{\hat{u}}(t)\right)^\top\vect{I}_2(t)+  \norm{\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)}_2^2 \\
\le & \left(1-\eta \ell \lambda_{\min}\left(\mat{\hat{G}}^{(H)}(t)\right)\right)\norm{\vect{y}-\vect{\hat{u}}(t)}_2^2 +2 \left(\vect{y}-\vect{\hat{u}}(t)\right)^\top(\vect{I'}_1(t)-\vect{I}_1(t))\\
&-2\left(\vect{y}-\vect{\hat{u}}(t)\right)^\top\vect{I}_2(t)+  \norm{\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)}_2^2.
\end{align*}

% \begin{align*}
% 	I^i_1=&-\langle \mathcal{W}_{t}-\mathcal{W}_{t-1}, \hat{u}'_i\left(\params(t)\right) \rangle \\
% 	\end{align*}

% \begin{align*}
% 	I_2^i(t) =& \int_{s=0}^{1} \langle \mathcal{W}_{t}-\mathcal{W}_{t-1}, \hat{u}'_i\left(\params(t)\right) -\hat{u}'_i\left(\params(t)-s (\params(t)-\params(t+1)) \right) \rangle ds \\
% 	\abs{I_2^i(t)} \le & \max_{0\le s\le 1} \sum_{h=1}^{H}  \norm{\mathcal{W}_{t}-\mathcal{W}_{t-1}}_F \norm{  \hat{u}'^{(h)}_i\left(\params(t)\right) -\hat{u}'^{(h)}_i\left(\params(t)-s (\mathcal{W}_{t-1}-\mathcal{W}_{t})\right)}_F .
% 	\end{align*}

Our hypothesis is:
\begin{condition}\label{cond:linear_converge_resist}
	At the $t+1$-th global synchronization, for the whole network, we have \begin{align*}
	\norm{\vect{y}-\vect{\hat{u}}(t,\mathcal{M}_t)}_2^2 \le (1-\frac{\eta \ell \lambda_0}{2})^{t} \norm{\vect{y}-\vect{\hat{u}}(0)}_2^2.
	\end{align*}
\end{condition}
In order to prove this, we need to show that $2 \left(\vect{y}-\vect{\hat{u}}(t)\right)^\top(\vect{I'}_1(t)-\vect{I}_1(t))$, $-2\left(\vect{y}-\vect{\hat{u}}(t)\right)^\top\vect{I}_2(t)$, and $\norm{\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)}_2^2$ are proportional to $\eta^2 \norm{\vect{y}-\vect{\hat{u}}(t)}_2^2$,
so that if we set $\eta$ sufficiently small, these terms are smaller than $\eta \lambda_{\min}\left(\mat{\hat{G}}^{(H)}(t)\right)\norm{\vect{y}-\vect{\hat{u}}(t)}_2^2$ and thus the loss function decreases at a linear rate.

Further, similar to \cite{du2019gradient}, to prove the induction hypothesis, it suffices to prove $\lambda_{\min}\left(\mat{\hat{G}}^{(H)}(t')\right) \ge \frac{\lambda_0}{2}$ for $t'=0,\ldots,t$, where $\lambda_0$ is independent of $m$.
Similar to \cite{du2019gradient}, we can show  at the beginning 
\begin{align*}
	\lambda_{\min}\left(\mat{\hat{G}}^{(H)}(0)\right) \ge \frac{3}{4}\lambda_0.
\end{align*}

Now for the $t$-th global iteration, by matrix perturbation analysis, we know it is sufficient to show $\norm{\mat{\hat{G}}^{(H)}(t)-\mat{\hat{G}}^{(H)}(0)}_2 \le \frac{1}{4}\lambda_0$.
To do this, we show that, as long as $m$ is large enough, every weight matrix is close to its initialization in a relative error sense.


\begin{lem}[Lemma on Initialization Norms for the whole network]
	\label{lem:init_norm_res_global}
	Suppose $\sigma(\cdot)$ is $L$-Lipschitz, $m = \Omega\left(\frac{n}{\delta}\right)$, and $\norm{\mathcal{W}^{(h)}_0}_2\le c_{w,0}\sqrt{m}$ for $h\in[2,H]$, where $c_{w,0} \approx 2$ for Gaussian initialization. Then, with probability at least $1-\delta$ over the random initialization, for every $h\in[H]$ and $i \in [n]$,
	\[ \frac{1}{c_{x,0}}\le \norm{\vect{x}_{i,0}^{(h)}}_2 \le c_{x,0} \]  for some universal constant $c_{x,0} > 1$.
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:init_norm_res_global}]
As the global model at initialization is the same as the original ResNet in \cite{du2019gradient}, we can use the same proof as for Lemma C.1 in \cite{du2019gradient}.
\end{proof}

The following lemma lower bounds $\mat{\hat{G}}^{(H)}(0)$'s least eigenvalue.
\begin{lem}[Least Eigenvalue at the Initialization]\label{lem:resnet_least_eigen_whole}
	If $m = \Omega\left(\frac{n^2\log(Hn/\delta)}{\lambda_0^2}\right)$, we have \begin{align*}
	\lambda_{\min}(\mat{\hat{G}}^{(H)}(0)) \ge \frac{3}{4}\lambda_0.
	\end{align*}
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:resnet_least_eigen_whole}]
As the global model at initialization is the same as the original ResNet in \cite{du2019gradient}, we can use the same proof as for Lemma C.2 in \cite{du2019gradient}.
\end{proof}

\begin{lem}\label{lem:pertubation_of_neuron_res_sub}
	Suppose $\sigma(\cdot)$ is $L$-Lipschitz and for $h\in[H]$, $\norm{\mathcal{W}^{(h)}_0}_2 \le c_{w,0}\sqrt{m}$, $\norm{\vect{x}^{(h)}_0}_2 \le c_{x,0}$ and $\norm{\mat{W}^{(h)}_{v,l_t,t}-\mathcal{W}^{(h)}_0}_F \le \sqrt{m} R$ for some constant $c_{w,0},c_{x,0} > 0$ and $R\le c_{w,0}$ .
	Then we have \begin{align*}
	\norm{\vect{x}^{(h)}_{v,l_t,t}-\vect{x}^{(h)}_0}_2 \le \left(\sqrt{c_{\sigma}}L+\frac{c_{x,0}}{c_{w,0}}+\frac{c_{x,0}}{R}\right)e^{2c_{res}c_{w,0}L} R \triangleq c'_xR.
	\end{align*} 
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:pertubation_of_neuron_res_sub}]
	We prove this lemma by induction.
	Our induction hypothesis is \begin{align*}
	\norm{\vect{x}^{(h)}_{v,l_t,t}-\vect{x}^{(h)}_{0}}_2 \le g(h)  ,
	\end{align*}	where \begin{align*}
	g(h) = \left[1+\frac{2c_{res}c_{w,0}L}{H}\right]g(h-1) + \frac{c_{res}Lc_{x,0}}{H}(c_{w,0}+R).
	\end{align*}
	For $h=1$, we have
	\begin{align*}
	\norm{\vect{x}^{(1)}_{v,l_t,t}-\vect{x}^{(1)}_0}_2&\le \sqrt{\frac{c_{\sigma}}{m}} \norm{\relu{\mat{W}^{(1)}_{v,l_t,t} \vect{x}} -\relu{\mathcal{W}^{(1)}_0 \vect{x}}}_2\\
&	\le \sqrt{\frac{c_{\sigma}}{m}}L\norm{\mat{W}^{(1)}_{v,l_t,t}-\mathcal{W}^{(1)}_0}_F \le \sqrt{c_{\sigma}}LR ,
	\end{align*}
	which implies $g(1)=\sqrt{c_{\sigma}}LR$. For $2\le h\le H$, we have
	\begin{align*}
&	\norm{\vect{x}^{(h)}_{v,l_t,t}-\vect{x}^{(h)}_0}_2 \le \frac{c_{res}}{H\sqrt{m}} \norm{\relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_{v,l_t,t}}M^{(h)}_{v,t} -\relu{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}1}_2\\&+\norm{\vect{x}^{(h-1)}_{v,l_t,t}-\vect{x}^{(h-1)}_0}_2 \\
&   \le \frac{c_{res}}{H\sqrt{m}} \norm{[\relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_{v,l_t,t}} -\relu{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}]M^{(h)}_{v,t} + \relu{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}(M^{(h)}_{v,t}-1)}_2\\&+\norm{\vect{x}^{(h-1)}_{v,l_t,t}-\vect{x}^{(h-1)}_0}_2 \\
&   \le \frac{c_{res}}{H\sqrt{m}} \norm{[\relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_{v,l_t,t}} -\relu{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}]M^{(h)}_{v,t}}_2 + \frac{c_{res}}{H\sqrt{m}} \norm{\relu{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}(M^{(h)}_{v,t}-1)}_2\\&+\norm{\vect{x}^{(h-1)}_{v,l_t,t}-\vect{x}^{(h-1)}_0}_2 \\
&   \le \frac{c_{res}}{H\sqrt{m}} \norm{[\relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_{v,l_t,t}} -\relu{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}]}_2\norm{M^{(h)}_{v,t}}_2 + \frac{c_{res}}{H\sqrt{m}} \norm{\relu{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}}_2\norm{(M^{(h)}_{v,t}-1)}_2\\&+\norm{\vect{x}^{(h-1)}_{v,l_t,t}-\vect{x}^{(h-1)}_0}_2 \\
&   \le \frac{c_{res}}{H\sqrt{m}} \norm{[\relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_{v,l_t,t}} -\relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_0} + \relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_0}-\relu{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}]}_2 \\&+ \frac{c_{res}L}{H\sqrt{m}} \norm{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}_2+\norm{\vect{x}^{(h-1)}_{v,l_t,t}-\vect{x}^{(h-1)}_0}_2 \\
& 	\le \frac{c_{res}}{H\sqrt{m}} \norm{\relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_{v,l_t,t}} -\relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_0}}_2\\ 
	& + \frac{c_{res}}{H\sqrt{m}} \norm{\relu{\mat{W}^{(h)}_{v,l_t,t} \vect{x}^{(h-1)}_0} -\relu{\mathcal{W}^{(h)}_0 \vect{x}^{(h-1)}_0}}_2\\ &+\norm{\vect{x}^{(h-1)}_{v,l_t,t}-\vect{x}^{(h-1)}_0}_2+ \frac{c_{res}Lc_{w,0}c_{x,0}}{H}\\
&	\le  \frac{c_{res}L}{H\sqrt{m}}\left(
	\norm{\mathcal{W}^{(h)}_0}_2 + \norm{\mat{W}^{(h)}_{v,l_t,t}-\mathcal{W}^{(h)}_0}_F
	\right) \cdot \norm{\vect{x}^{(h-1)}_{v,l_t,t}-\vect{x}^{(h-1)}_0}_2 \\
	& + \frac{c_{res}L}{H\sqrt{m}}\norm{\mat{W}^{(h)}_{v,l_t,t}-\mathcal{W}^{(h)}_0}_F \norm{\vect{x}^{(h-1)}_0}_2 +\norm{\vect{x}^{(h-1)}_{v,l_t,t}-\vect{x}^{(h-1)}_0}_2+ \frac{c_{res}Lc_{w,0}c_{x,0}}{H}\\
&	\le \left[1+\frac{c_{res}L}{H\sqrt{m}}\left(c_{w,0}\sqrt{m}+R\sqrt{m}\right)\right]g(h-1) + \frac{c_{res}L}{H\sqrt{m}} \sqrt{m} R c_{x,0}+ \frac{c_{res}Lc_{w,0}c_{x,0}}{H}\\
&	\le \left(1+\frac{2c_{res}c_{w,0}L}{H}\right)g(h-1)+\frac{c_{res}}{H} Lc_{x,0}R + \frac{c_{res}Lc_{w,0}c_{x,0}}{H}. 
	\end{align*}
	Lastly, simple calculations show $g(h) \le \left(\sqrt{c_{\sigma}}L+\frac{c_{x,0}}{c_{w,0}}+\frac{c_{x,0}}{R}\right)e^{2c_{res}c_{w,0}L} R$. 
	
\end{proof}

\begin{lem}\label{lem:pertubation_of_neuron_res_whole}
	Suppose $\sigma(\cdot)$ is $L$-Lipschitz and for $h\in[H]$, $\norm{\mathcal{W}^{(h)}_0}_2 \le c_{w,0}\sqrt{m}$, $\norm{\vect{x}^{(h)}_0}_2 \le c_{x,0}$ and $\norm{\mathcal{W}^{(h)}_{t}-\mathcal{W}^{(h)}_0}_F \le \sqrt{m} R$ for some constant $c_{w,0},c_{x,0} > 0$ and $R\le c_{w,0}$ .
	Then we have \begin{align*}
	\norm{\vect{x}^{(h)}_{t}-\vect{x}^{(h)}_0}_2 \le \left(\sqrt{c_{\sigma}}L+\frac{c_{x,0}}{c_{w,0}}\right)e^{2c_{res}c_{w,0}L} R \triangleq c_xR.
	\end{align*} 
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:pertubation_of_neuron_res_whole}]
The proof is exactly the same as the proof of Lemma C.3 in \cite{du2019gradient}.
\end{proof}




Next, we characterize how the perturbation of the weight matrices affects $\mat{\hat{G}}^{(H)}$.
\begin{lem}\label{lem:close_to_init_small_perturbation_res_smooth}
	Suppose $\sigma(\cdot)$ is differentiable, $L$-Lipschitz and $\beta$-smooth. Suppose for $h\in[H]$, $\norm{\mathcal{W}^{(h)}_0}_2\le c_{w,0}\sqrt{m}$, $\norm{\vect{a}_0}_2\le a_{2,0}\sqrt{m}$, $\norm{\vect{a}_0}_4\le a_{4,0}m^{1/4}$, and $\frac{1}{c_{x,0}}\le\norm{\vect{x}^{(h)}_0}_2 \le c_{x,0}$. If $\norm{\mathcal{W}^{(h)}_t-\mathcal{W}^{(h)}_0}_F, \norm{\vect{a}_t-\vect{a}_0}_2\le \sqrt{m}R$ where $R \le c \lambda_0H^2n^{-1}$ and $R\le c$ for some small constant $c$, we have  \begin{align*}
	\norm{\mat{\hat{G}}^{(H)}(t) - \mat{\hat{G}}^{(H)}(0)}_2 \le \frac{\lambda_0}{4}.
	\end{align*}
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:close_to_init_small_perturbation_res_smooth}]
	Similar to Lemma C.4 in \cite{du2019gradient}, because the Frobenius norm of a matrix is at least as large as its operator norm, it is sufficient to bound $\norm{\mat{\hat{G}}^{(H)}(t) - \mat{\hat{G}}^{(H)}(0)}_F$.
	For simplicity define $z_{i,q}(t) = \mathcal{W}_{t,q}^{{(H)}\top} \vect{x}_{i,t}^{(H-1)}$, we have
	\begin{align*}
	&\abs{\mat{\hat{G}}_{i,j}^{(H)}(t)-\mat{\hat{G}}_{i,j}^{(H)}(0)} \\
	= & \frac{c_{res}^2}{H^2m} \big{|}\vect{x}_{i,t}^{(H-1)\top} \vect{x}_{j,t}^{(H-1)}
	  \sum_{q=1}^{m}a_q(t)^2\sigma'\left(z_{i,q}(t)\right)\sigma'\left(z_{j,q}(t)\right)
	\\
	&-\vect{x}_{i,0}^{(H-1)\top} \vect{x}_{j,0}^{(H-1)}
	 \sum_{q=1}^{m}a_q(0)^2\sigma'\left(z_{i,q}(0)\right)\sigma'\left(z_{j,q}(0)\right)
	\big{|} \\
% 	= & \frac{c_{res}^2}{H^2m} \big{|}\vect{x}_{i,t}^{(H-1)}^\top \vect{x}_{j,t}^{(H-1)}
% 	  \sum_{t=1}^{m}a_q(t)^2\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)
% 	\\
% 	&+\vect{x}_{i,t}^{(H-1)}^\top \vect{x}_{j,t}^{(H-1)}
% 	  \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)\\
% 	&-\vect{x}_{i,t}^{(H-1)}^\top \vect{x}_{j,t}^{(H-1)}
% 	  \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right) \\
% 	&-\vect{x}_i^{(H-1)}(0)^\top \vect{x}_j^{(H-1)}(0)
% 	 \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)
% 	\\
% 	&+\vect{x}_i^{(H-1)}(0)^\top \vect{x}_j^{(H-1)}(0)
% 	 \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)
% 	\\
% 	&-\vect{x}_i^{(H-1)}(0)^\top \vect{x}_j^{(H-1)}(0)
% 	 \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(0)\right)\sigma'\left(z_{j,t}(0)\right)
% 	\big{|} \\
% 	= & \frac{c_{res}^2}{H^2m} \big{|}(\vect{x}_{i,t}^{(H-1)}^\top \vect{x}_{j,t}^{(H-1)}-\vect{x}_i^{(H-1)}(0)^\top \vect{x}_j^{(H-1)}(0))
% 	  \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)
% 	\\
% 	&+\vect{x}_{i,t}^{(H-1)}^\top \vect{x}_{j,t}^{(H-1)}
% 	  \sum_{t=1}^{m}(a_q(t)^2-a_q(0)^2)\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)\\
% 	&+\vect{x}_i^{(H-1)}(0)^\top \vect{x}_j^{(H-1)}(0)
% 	\left[ \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)- \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(0)\right)\sigma'\left(z_{j,t}(0)\right)\right]
% 	\big{|} \\
% 	\le & \frac{c_{res}^2}{H^2m}\abs{\vect{x}_{i,t}^{(H-1)}^\top \vect{x}_{j,t}^{(H-1)} - \vect{x}_i^{(H-1)}(0)^\top \vect{x}_j^{(H-1)}(0)}  \sum_{t=1}^{m}a_q(0)^2\abs{\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right) }\\
% 	& + \frac{c_{res}^2}{H^2m} \abs{\vect{x}_i^{(H-1)}(0)^\top \vect{x}_j^{(H-1)}(0)} \left[ \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)- \sum_{t=1}^{m}a_q(0)^2\sigma'\left(z_{i,t}(0)\right)\sigma'\left(z_{j,t}(0)\right)\right] \\
% 	&+\frac{c_{res}^2}{H^2m} \abs{\vect{x}_{i,t}^{(H-1)}^\top \vect{x}_{j,t}^{(H-1)}}  \abs{\sum_{t=1}^{m} \left(a_q(t)^2-a_q(0)^2\right)
% 		\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)
% 	} \\
% 		\le & \frac{c_{res}^2}{H^2}L^2a_{2,0}^2\abs{\vect{x}_{i,t}^{(H-1)}^\top \vect{x}_{j,t}^{(H-1)} - \vect{x}_i^{(H-1)}(0)^\top \vect{x}_j^{(H-1)}(0)} \\
% 	&+ \frac{c_{res}^2}{H^2}\frac{c_{x,0}^2}{m} \abs{}\abs{\sum_{t=1}^{m}a_q(0)^2\left(	\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)
% 		- 
% 		\sigma'\left(z_{i,t}(0)\right)\sigma'\left(z_{j,t}(0)\right)\right)} \\
% &+\frac{c_{res}^2}{H^2m} \abs{\vect{x}_{i,t}^{(H-1)}^\top \vect{x}_{j,t}^{(H-1)}}  \abs{\sum_{t=1}^{m} \left(a_q(t)^2-a_q(0)^2\right)
% 		\sigma'\left(z_{i,t}(t)\right)\sigma'\left(z_{j,t}(t)\right)
% 	}\\
\le & \frac{c_{res}^2}{H^2}L^2a_{2,0}^2\abs{\vect{x}_{i,t}^{(H-1)\top} \vect{x}_{j,t}^{(H-1)} - \vect{x}_{i,0}^{(H-1)\top} \vect{x}_{j,0}^{(H-1)}} \\
	&+ \frac{c_{res}^2}{H^2}\frac{c_{x,0}^2}{m}\abs{\sum_{q=1}^{m}a_q(0)^2\left(	\sigma'\left(z_{i,q}(t)\right)\sigma'\left(z_{j,q}(t)\right)
		- 
		\sigma'\left(z_{i,q}(0)\right)\sigma'\left(z_{j,q}(0)\right)\right)} \\
&+\frac{c_{res}^2}{H^2m} \abs{\vect{x}_{i,t}^{(H-1)\top} \vect{x}_{j,t}^{(H-1)}}  \abs{\sum_{q=1}^{m} \left(a_q(t)^2-a_q(0)^2\right)
		\sigma'\left(z_{i,q}(t)\right)\sigma'\left(z_{j,q}(t)\right)
	}\\
	\triangleq& \frac{c_{res}^2}{H^2}(I_1^{i,j} + I_2^{i,j} + I_3^{i,j}).
	\end{align*}
	
	For $I_1^{i,j}$, using Lemma~\ref{lem:pertubation_of_neuron_res_whole}, we have \begin{align*}
	I_1^{i,j} = &L^2a_{2,0}^2\abs{\vect{x}_{i,t}^{(H-1)\top} \vect{x}_{j,t}^{(H-1)} - \vect{x}_{i,0}^{(H-1)\top} \vect{x}_{j,0}^{(H-1)}}  \\
	\le & L^2a_{2,0}^2\abs{
		(\vect{x}_{i,t}^{(H-1)}-\vect{x}_{i,0}^{(H-1)})^\top \vect{x}_{j,t}^{(H-1)}} + L^2a_{2,0}^2\abs{
		\vect{x}_{i,0}^{(H-1)\top}(\vect{x}_{j,t}^{(H-1)}-\vect{x}_{j,0}^{(H-1)})}  \\
	\le & c_{x}L^2 a_{2,0}^2R \cdot (c_{x,0} + c_{x} R) + c_{x,0} c_x L^2a_{2,0}^2R \\
	\le &3 c_{x,0} c_x L^2a_{2,0}^2 R,
	\end{align*}
	
	As in Lemma C.4 of \cite{du2019gradient}, to bound $I_{2}^{i,j}$, we have
	\begin{align*}
	I_2^{i,j} =&c_{x,0}^2 \frac{1}{m} \abs{\sum_{q=1}^{m}
		a_q(0)^2\sigma'\left(z_{i,q}(t)\right)\sigma'\left(z_{j,q}(t)\right)
		- 
		a_q(0)^2\sigma'\left(z_{i,q}(0)\right)\sigma'\left(z_{j,q}(0)\right)
	}\\
% 	\le &c_{x,0}^2 \frac{1}{m}\sum_{t=1}^{m}		a_q(0)^2\abs{\left( \sigma'\left(z_{i,t}(t)\right)-\sigma'\left(z_{i,t}(0)\right) \right)\sigma'\left(z_{j,t}(t)\right)} +	a_q(0)^2\abs{\left( \sigma'\left(z_{j,t}(t)\right)-\sigma'\left(z_{j,t}(0)\right) \right)\sigma'\left(z_{i,t}(0)\right)}\\
% 	\le & \frac{\beta L c_{x,0}^2}{m} \left(
% 	\sum_{t=1}^{m} 	a_q(0)^2\abs{z_{i,t}(t)-z_{i,t}(0)}+	a_q(0)^2\abs{z_{j,t}(t)-z_{j,t}(0)}
% 	\right) \\
	\le & \frac{\beta La_{4,0}^2 c_{x,0}^2}{\sqrt{m}}\left(\sqrt{\sum_{q=1}^{m}\abs{z_{i,q}(t)-z_{i,q}(0)}^2}+\sqrt{\sum_{q=1}^{m}\abs{z_{j,q}(t)-z_{j,q}(0)}^2}\right) .
	\end{align*}
	Using the same proof for Lemma~\ref{lem:pertubation_of_neuron_res_whole}, it is easy to see \begin{align*}
	\sum_{q=1}^{m}\abs{z_{i,q}(t)-z_{i,q}(0)}^2 \le \left(2c_xc_{w,0}+c_{x,0}\right)^2L^2mR^2 .
	\end{align*}
	Thus
	\begin{align*}
	I_2^{i,j}\le 2\beta c_{x,0}^2 \left(2c_xc_{w,0}+c_{x,0}\right)L^2 R .
	\end{align*}
	The bound on $I_3^{i,j}$ is the same as that on $I_3^{i,j}$ in Lemma C.4 of \cite{du2019gradient}:
		\begin{align*}
	I_3^{i,j}
	&\le 12L^2c_{x,0}^2 a_{2,0}R.
	\end{align*}
	Therefore we can bound the perturbation\begin{align*}
	\norm{\mat{\hat{G}}^{(H)}(t) - \mat{\hat{G}}^{(H)}(0)}_F=&\sqrt{\sum_{(i,j)}^{{n,n}} \abs{\mat{\hat{G}}_{i,j}^{(H)}(t)-\mat{\hat{G}}_{i,j}^{(H)}(0)}^2} \\
	\le& \frac{c_{res}^2}{H^2}\sqrt{n^2\left(3 c_{x,0} c_xL^2a_{2,0}^2 R+2\beta c_{x,0}^2 \left(2c_xc_{w,0}+c_{x,0}\right)L^2 R+12L^2c_{x,0}^2 a_{2,0}R\right)^2} \\
	=& \frac{c_{res}^2}{H^2}n\left(3 c_{x,0} c_xL^2a_{2,0}^2 R+2\beta c_{x,0}^2 \left(2c_xc_{w,0}+c_{x,0}\right)L^2 R+12L^2c_{x,0}^2 a_{2,0}R\right).
	\end{align*}
	Plugging in the bound on $R$, we have the desired result.
\end{proof}

Now we prove Theorem~\ref{thm:resist_gd} by induction. Assuming Condition~\ref{cond:linear_converge_resist}, we first bound the change of the weights so that the hypotheses of Lemma~\ref{lem:close_to_init_small_perturbation_res_smooth} are satisfied, and then we show that
$2 \left(\vect{y}-\vect{\hat{u}}(t)\right)^\top(\vect{I'}_1(t)-\vect{I}_1(t))$, $-2\left(\vect{y}-\vect{\hat{u}}(t)\right)^\top\vect{I}_2(t)$, and $\norm{\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)}_2^2$ are each proportional to $\eta^2 \norm{\vect{y}-\vect{\hat{u}}(t)}_2^2$.
Then, if we set $\eta$ sufficiently small, these terms are smaller than $\eta \lambda_{\min}\left(\mat{\hat{G}}^{(H)}(t)\right)\norm{\vect{y}-\vect{\hat{u}}(t)}_2^2$, and thus the loss function decreases at a linear rate.
\begin{lem}\label{lem:dist_from_init_resnet_sub}
	If Condition~\ref{cond:linear_converge_resist} holds for $t'=0,\ldots,t-1$, we have for any $1  \le v \le S$, $0 \le l_t \le \ell$
	\begin{align*}
	&\norm{\mat{W}^{(h)}_{v,l_t,t}-\mathcal{W}^{(h)}_0}_F, \norm{\vect{a}_{v,l_t,t}-\vect{a}_0}_2\le  R'\sqrt{m},\\
	&\norm{\mat{W}^{(h)}_{v,l_t,t}-\mat{W}^{(h)}_{v,l_t-1,t}}_F,\norm{\vect{a}_{v,l_t,t}-\vect{a}_{v,l_t-1,t}}_2\le \eta Q'(l_t-1,t),
	\end{align*} where $R'=\frac{16 c_{res}c_{x,0}a_{2,0}Le^{2c_{res}c_{w,0}L} \sqrt{n} \norm{\vect{y}-\vect{u}(0)}_2}{H\lambda_0\sqrt{m}} <c$ for some small constant $c$ , \\$ Q'(l_t,t)= 4c_{res}c_{x,0}a_{2,0}Le^{2c_{res}c_{w,0}L}\sqrt{n} \norm{\vect{y}-\vect{u}_{t,l_t}}_2/H$ and \\$ Q'(t)= 4c_{res}c_{x,0}a_{2,0}Le^{2c_{res}c_{w,0}L}\sqrt{n} \norm{\vect{y}-\vect{\hat{u}}_{t}}_2/H$.
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:dist_from_init_resnet_sub}]
	We will prove this lemma by induction. The induction hypothesis is
		\begin{align*}
	\norm{\mat{W}^{(h)}_{v,l_t,t}-\mathcal{W}^{(h)}_0}_F &\le  R'\sqrt{m}\\
	\norm{\vect{a}_{v,l_t,t}-\vect{a}_0}_2 &\le  R'\sqrt{m}.
	\end{align*}
	First we want to prove it holds for $t'=0$ and $0 \le l_t \le \ell$. 
	
	We prove it by induction with respect to $l_t$.
	It is easy to see that it holds for $t'=0$ and $l_t'=0$.
	Suppose it holds for $0 \le l_t' \le l_t$; we want to prove it holds for $l_t'=l_t+1$.
	Following Lemma C.5 in \cite{du2019gradient}, note that $\norm{\mat{J}_{i,v,l_t,t}^{(k)}}_2 \le L$.
	We have
	\begin{align*} 
	&\norm{\mat{W}^{(h)}_{v,l_t+1,t}-\mat{W}^{(h)}_{v,l_t,t}}_F\\
	\le&  \eta \frac{c_{res}}{H\sqrt{m}} \norm{\vect{a}_{v,l_t,t}}_2 \sum_{i=1}^{n}\abs{y_i-u_{i,v,l_t,t}}\norm{\vect{x}^{(h-1)}_{i,v,l_t,t}}_2 \prod_{k=h+1}^H \norm{\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(k)}\mat{W}^{(k)}_{v,l_t,t}M_{v,t}^{(k)} }_2 \norm{\mat{J}_{i,v,l_t,t}^{(h)}}_2 \norm{M_{v,t}^{(h)}}_2 \\
	\le &\eta \frac{Lc_{res}}{H\sqrt{m}} \norm{\vect{a}_{v,l_t,t}}_2 \sum_{i=1}^{n}\abs{y_i-u_{i,v,l_t,t}}\norm{\vect{x}^{(h-1)}_{i,v,l_t,t}}_2 \prod_{k=h+1}^H \norm{\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(k)}\mat{W}^{(k)}_{v,l_t,t}M_{v,t}^{(k)} }_2 .
	\end{align*}
	
	Further
	\begin{align*} 
	&\prod_{k=h+1}^H \norm{\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(k)}\mat{W}^{(k)}_{v,l_t,t} M_{v,t}^{(k)}}_2 \\
	\le & \prod_{k=h+1}^H \left(\norm{\mat{I}}_2+\norm{\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(k)}\mat{W}^{(k)}_{v,l_t,t} M_{v,t}^{(k)}}_2\right) \\
	\le & \prod_{k=h+1}^H \left(\norm{\mat{I}}_2+\frac{c_{res}}{H\sqrt{m}}\norm{\mat{J}_{i,v,l_t,t}^{(k)}}_2\norm{\mat{W}^{(k)}_{v,l_t,t}}_2\norm{ M_{v,t}^{(k)}}_2\right) \\
	\le & \prod_{k=h+1}^H \left(\norm{\mat{I}}_2+\frac{c_{res}L}{H\sqrt{m}}\left(\norm{\mathcal{W}^{(k)}_0}_2+\norm{\mat{W}^{(k)}_{v,l_t,t}-\mathcal{W}^{(k)}_0}_F\right)\right) \\
	\le & \prod_{k=h+1}^H \left(1+\frac{c_{res}L}{H}(c_{w,0}+R')\right) \\
	\le & \prod_{k=h+1}^H \left(1+\frac{2c_{res}c_{w,0}L}{H}\right) \\
	\le & e^{2c_{res}c_{w,0}L}.
	\end{align*}

	Thus
	\begin{align*} 
	&\norm{\mat{W}^{(h)}_{v,l_t+1,t}-\mat{W}^{(h)}_{v,l_t,t}}_F\\
	\le &\eta \frac{Lc_{res}}{H\sqrt{m}} \norm{\vect{a}_{v,l_t,t}}_2 \sum_{i=1}^{n}\abs{y_i-u_{i,v,l_t,t}}\norm{\vect{x}^{(h-1)}_{i,v,l_t,t}}_2 \prod_{k=h+1}^H \norm{\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(k)}\mat{W}^{(k)}_{v,l_t,t} M_{v,t}^{(k)}}_2\\
	\le &\eta \frac{Lc_{res}}{H\sqrt{m}} \norm{\vect{a}_{v,l_t,t}}_2 \sum_{i=1}^{n}\abs{y_i-u_{i,v,l_t,t}}\norm{\vect{x}^{(h-1)}_{i,v,l_t,t}}_2 e^{2c_{res}c_{w,0}L}\\
	\le &\eta c_{res}(c_{x,0}+c_{x}R')La_{2,0}e^{2c_{res}c_{w,0}L}\sqrt{n} \norm{\vect{y}-\vect{u}_{l_t,t}}_2/H\\
	\le &3\eta c_{res}c_{x,0}La_{2,0}e^{2c_{res}c_{w,0}L}\sqrt{n} \norm{\vect{y}-\vect{u}_{l_t,t}}_2/H\\
	\le & \eta Q'(l_t,t)\\
	\le& (1-\frac{\eta \lambda_0}{2})^{l_t/2}\frac{1}{4}\eta \lambda_0 R'\sqrt{m}.
	\end{align*}
		Similarly, we have \begin{align*}
	\norm{\vect{a}_{v,l_t+1,t}-\vect{a}_{v,l_t,t}}_2 \le& 3\eta c_{x,0} \sum_{i=1}^{n}\abs{y_i-u_{i,v,l_t,t}}\\
	\le& \eta Q'(l_t,t)\\
	\le& (1-\frac{\eta \lambda_0}{2})^{l_t/2}\frac{1}{4}\eta \lambda_0 R'\sqrt{m}.
	\end{align*}
	
	Thus
	\begin{align*}
	&\norm{\mat{W}^{(h)}_{v,l_t+1,t}-\mathcal{W}^{(h)}_0}_F\\
	\le& \norm{\mat{W}^{(h)}_{v,l_t+1,t}-\mat{W}^{(h)}_{v,l_t,t}}_F+\norm{\mat{W}^{(h)}_{v,l_t,t}-\mathcal{W}^{(h)}_0}_F\\
	\le&\sum_{l_t'=0}^{l_t} (1-\frac{\eta \lambda_0}{2})^{l_t'/2}\frac{1}{4}\eta \lambda_0 R'\sqrt{m} \le R'\sqrt{m}.
	\end{align*}
		Similarly,
	\begin{align*}
	&\norm{\vect{a}_{v,l_t+1,t}-\vect{a}_0}_2\\
	\le&\sum_{l_t'=0}^{l_t} (1-\frac{\eta \lambda_0}{2})^{l_t'/2}\frac{1}{4}\eta \lambda_0 R'\sqrt{m} \le R'\sqrt{m}.
	\end{align*}
	
	Now suppose the hypothesis holds for $t'=0,1,\ldots,t$ and for $0 \le l_t \le \ell$. We want to prove that the hypothesis holds for $t'=t+1$.
	By Lemma~\ref{lem:weight_sub_whole}, we know $\norm{\mathcal{W}^{(h)}_{t}-\mathcal{W}^{(h)}_0}_F \le \sqrt{m} R'$.
	Thus, $\norm{\mat{W}^{(h)}_{v,l_t=0,t+1}-\mathcal{W}^{(h)}_0}_F \le \sqrt{m} R'$.
	Thus, by using the same induction on $l_t$ as above, we can prove the hypothesis for $t+1$.
	
\end{proof}


\begin{lem}\label{lem:weight_sub_whole}
Assume 
\begin{align*}
% &\norm{\mathcal{W}^{(h)}_0}_F\le c_{w,0}\sqrt{m} \\
&\norm{\mat{W}^{(h)}_{v,l_t,t}-\mathcal{W}^{(h)}_0}_F, \norm{\vect{a}_{v,l_t,t}-\vect{a}_0}_2 \le \sqrt{m} R' \\
\end{align*} 
We have
\begin{align*}
\norm{\mathcal{W}^{(h)}_{t}-\mathcal{W}^{(h)}_0}_F,\norm{\vect{a}_{t}-\vect{a}_0}_2 \le \sqrt{m} R'
\end{align*} 

\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:weight_sub_whole}]
\begin{align*}
\norm{\mathcal{W}^{(h)}_{t}-\mathcal{W}^{(h)}_0}_F &= \norm{\frac{\sum_{v=1}^{S} \mat{W}^{(h)}_{v,\ell,t}M^{(h)}_{v,t}}{\sum_{v=1}^{S}M^{(h)}_{v,t}} - \mathcal{W}^{(h)}_0}_F \\
& \le \frac{\sum_{v: M^{(h)}_{v,t}=1} \norm{\mat{W}^{(h)}_{v,\ell,t}-\mathcal{W}^{(h)}_0}_F}{\sum_{v=1}^{S}M^{(h)}_{v,t}} \\
& \le \sqrt{m} R'
\end{align*} 
Similarly,
\begin{align*}
\norm{\vect{a}_{t}-\vect{a}_0}_2 & \le \frac{\sum_{v=1}^{S} \norm{\vect{a}_{v,l_t,t}-\vect{a}_0}_2}{S} \\
& \le \sqrt{m} R'
\end{align*} 
\end{proof}

\begin{lem}\label{lem:dist_from_wholenetwork}
If Condition~\ref{cond:linear_converge_resist} holds for $t'=0,\ldots,t-1$ and $\eta\le c\lambda_0H^2n^{-2} \ell^{-2} S^{-1}$ for some small constant $c$, we have
$\norm{\vect{I'}^{i}_1(t)-\vect{I}^{i}_1(t)}_2 \le  C_{I_1}^{*} \eta^2 \norm{y_i-\hat{u}_{i,t-1}}_2$, where $C_{I_1}^{*}$ is a constant, and thus $\norm{\vect{I'}_1(t)-\vect{I}_1(t)}_2 \le \frac{1}{16}\eta \lambda_0 \norm{\vect{y}-\vect{\hat{u}}(t)}_2$.
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:dist_from_wholenetwork}]
\begin{align*}
\norm{\vect{I'}^{i}_1(t)-\vect{I}^{i}_1(t)}_2 & =\norm{\langle \eta \ell L'(\params(t))-(\theta(t+1)-\theta(t)), \hat{u}'_i\left(\params(t)\right) \rangle}_2 \\
	& \le \sum_{h=1}^{H} \norm{\eta \frac{\sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \frac{\partial L}{\partial \mat{W}^{(h)}_{v,l_t,t}}}{\sum_{v=1}^{S}M^{(h)}_{v,t}}-\eta \ell \frac{\partial L}{\partial\mathcal{W}_{t-1}^{(h)}}}_F \norm{\hat{u}'_i\left(\params(t)\right)}_2 \\
	& + \norm{\eta \frac{\sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \frac{\partial L}{\partial \vect{a}_{v,l_t,t}}}{S}-\eta \ell \frac{\partial L}{\partial\vect{a}_t}}_2 \norm{\hat{u}'_i\left(\params(t)\right)}_2 .
\end{align*}
	Let $M_{t,h}=\sum_{v=1}^{S}M^{(h)}_{v,t}$
	\begin{align*}
	&\norm{\eta \frac{\sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \frac{\partial L}{\partial \mat{W}^{(h)}_{v,l_t,t}}}{\sum_{v=1}^{S}M^{(h)}_{v,t}}-\eta \ell \frac{\partial L}{\partial\mathcal{W}_{t-1}^{(h)}}}_F \\
	\le &   \eta 1/M_{t,h} \sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \norm{\frac{\partial L}{\partial \mat{W}^{(h)}_{v,l_t,t}}-\frac{\partial L}{\partial\mathcal{W}_{t-1}^{(h)}}}_F \\
	\le & \eta 1/M_{t,h} \sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \big{||} \frac{c_{res}}{H\sqrt{m}}
\sum_{i=1}^{n}(y_i-u_{i,v,l_t,t})\vect{x}_{i,v,l_t,t}^{(h-1)} \cdot
\left[\vect{a}_{v,l_t,t}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(l)}\mat{W}_{v,l_t,t}^{(l)} M^{(l)}_{v,t} \right) \mat{J}_{i,v,l_t,t}^{(h)}  M^{(h)}_{v,t}\right] \\
& - \frac{c_{res}}{H\sqrt{m}}
\sum_{i=1}^{n}(y_i-\hat{u}_{i,t-1})\vect{x}_{i,t-1}^{(h-1)} \cdot
\left[\vect{a}_{t-1}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,t-1}^{(l)}\mathcal{W}_{t-1}^{(l)}  \right) \mat{J}_{i,t-1}^{(h)}  \right] \big{||}_F \\
\le & \eta 1/M_{t,h} \sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \frac{c_{res}}{H\sqrt{m}} \sum_{i=1}^{n} \big{||} 
(y_i-u_{i,v,l_t,t})\vect{x}_{i,v,l_t,t}^{(h-1)} \cdot
\left[\vect{a}_{v,l_t,t}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(l)}\mat{W}_{v,l_t,t}^{(l)} M^{(l)}_{v,t} \right) \mat{J}_{i,v,l_t,t}^{(h)}  M^{(h)}_{v,t}\right] \\
& - (y_i-\hat{u}_{i,t-1})\vect{x}_{i,t-1}^{(h-1)} \cdot
\left[\vect{a}_{t-1}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,t-1}^{(l)}\mathcal{W}_{t-1}^{(l)}  \right) \mat{J}_{i,t-1}^{(h)}  \right] \big{||}_F \\
	\end{align*}
	Through standard calculations, we have
	\begin{align*}
	\norm{	\mathcal{W}_{t-1}^{(l)}-\mat{W}_{v,l_t,t}^{(l)}}_F \le &\eta \ell Q'(0,t),\\
		\norm{	\vect{a}_{t-1}-\vect{a}_{v,l_t,t}}_F \le &\eta \ell Q'(0,t),\\
	\norm{	\vect{x}_{i,t-1}^{(h-1)}-\vect{x}_{i,v,l_t,t}^{(h-1)}}_F \le &\eta \ell c'_x \frac{Q'(0,t)}{\sqrt{m}},\\
	\norm{	\mat{J}_{i,t-1}^{(l)}-\mat{J}_{i,v,l_t,t}^{(l)}}_F \le &2 \ell \left(c_{x,0}+c_{w,0}c'_x\right)\eta \beta   Q'(0,t),
	\end{align*}
	where $c'_x\triangleq\left(\sqrt{c_{\sigma}}L+\frac{c_{x,0}}{c_{w,0}}+\frac{c_{x,0}}{R}\right)e^{2c_{res}c_{w,0}L}$.
We know that $\norm{y_i-u_{i,v,l_t,t}} \le \norm{y_i-\hat{u}_{i,t-1}}$; suppose in addition that $\norm{u_{i,v,l_t,t}-\hat{u}_{i,t-1}} \le C_u$.
	
	According to Lemma G.1 in \cite{du2019gradient}, we have
	\begin{align*}
	& \eta 1/M_{t,h} \sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \frac{c_{res}}{H\sqrt{m}} \sum_{i=1}^{n} \big{||} 
(y_i-u_{i,v,l_t,t})\vect{x}_{i,v,l_t,t}^{(h-1)} \cdot
\left[\vect{a}_{v,l_t,t}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,v,l_t,t}^{(l)}\mat{W}_{v,l_t,t}^{(l)} M^{(l)}_{v,t} \right) \mat{J}_{i,v,l_t,t}^{(h)}  M^{(h)}_{v,t}\right] \\
& - (y_i-\hat{u}_{i,t-1})\vect{x}_{i,t-1}^{(h-1)} \cdot
\left[\vect{a}_{t-1}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,t-1}^{(l)}\mathcal{W}_{t-1}^{(l)}  \right) \mat{J}_{i,t-1}^{(h)}  \right] \big{||}_F \\
	\le & \eta 1/M_{t,h} S \ell n 
	\frac{4}{H}c_{res}c_{x,0}La_{2,0}e^{2Lc_{w,0}}(C_u \\& +\eta \ell\frac{Q'(0,t)}{\sqrt{m}}\left(\frac{c_x}{c_{x,0}}+\frac{2}{L}\left(c_{x,0}+c_{w,0}c_x\right)\beta \sqrt{m}+4c_{w,0}\left(c_{x,0}+c_{w,0}c_x\right)\beta+L+1 \right))\norm{y_i-\hat{u}_{i,t-1}}_2\\
	\end{align*}
On the other hand,
\begin{align*}
    \norm{\eta \frac{\sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \frac{\partial L}{\partial \vect{a}_{v,l_t,t}}}{S}-\eta \ell \frac{\partial L}{\partial\vect{a}_t}}_2 & \le   \eta 1/S \sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \norm{\frac{\partial L}{\partial \vect{a}_{v,l_t,t}}-\frac{\partial L}{\partial\vect{a}_t}}_2 \\
    & \le \eta 1/S \sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \sum_{i=1}^{n} \norm{(y_i-u_{i,v,l_t,t})\vect{x}_{i,v,l_t,t}^{(H)}-(y_i-\hat{u}_{i,t})\vect{x}_{i,t-1}^{(H)}}_2 \\
    & \le \eta \ell n (C_{u}+\eta \ell c'_x \frac{Q'(0,t)}{\sqrt{m}}) \norm{y_i-\hat{u}_{i,t-1}}_2
\end{align*}
Also,
\begin{align*}
\norm{\hat{u}'_i\left(\params(t)\right)}_2 & \le \frac{c_{res}}{H\sqrt{m}}\sum_{h=1}^H \norm{\frac{\partial \hat{u}_i\left(\params(t)\right)}{\partial \mathcal{W}^{(h)}_{t-1}}}_2 \\
& = \frac{c_{res}}{H\sqrt{m}}\sum_{h=1}^H \norm{\vect{x}_{i,t-1}^{(h-1)} \cdot
\left[\vect{a}_{t-1}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,t-1}^{(l)}\mathcal{W}_{t-1}^{(l)}  \right) \mat{J}_{i,t-1}^{(h)}  \right]}_2 \\
& \le  \frac{c_{res}}{H\sqrt{m}}\sum_{h=1}^H \norm{\vect{x}_{i,t-1}^{(h-1)}}_2 \norm{\vect{a}_{t-1}}_2 \norm{\prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,t-1}^{(l)}\mathcal{W}_{t-1}^{(l)}  \right)}_2 \norm{\mat{J}_{i,t-1}^{(h)}}_2 \\
& \le \frac{c_{res}}{H}  H \cdot 2 c_{x,0} a_{2,0} L e^{2c_{res}c_{w,0}L} \\
& = 2c_{res}c_{x,0} a_{2,0} L e^{2c_{res}c_{w,0}L}.
\end{align*}

Thus, combining all of the above with Lemma~\ref{lem:dist_u}, we obtain
\begin{align*}
	\norm{\vect{I'}^{i}_1(t)-\vect{I}^{i}_1(t)}_2 & \le \sum_{h=1}^{H} \norm{\eta \frac{\sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \frac{\partial L}{\partial \mat{W}^{(h)}_{v,l_t,t}}}{\sum_{v=1}^{S}M^{(h)}_{v,t}}-\eta \ell \frac{\partial L}{\partial\mathcal{W}_{t-1}^{(h)}}}_F \norm{\hat{u}'_i\left(\params(t)\right)}_2 \\
	& + \norm{\eta \frac{\sum_{v=1}^{S} \sum_{l_t=1}^{\ell} \frac{\partial L}{\partial \vect{a}_{v,l_t,t}}}{S}-\eta \ell \frac{\partial L}{\partial\vect{a}_t}}_2 \norm{\hat{u}'_i\left(\params(t)\right)}_2 \\
	& \le C_{I_1}^{*} \eta^2 \norm{y_i-\hat{u}_{i,t-1}}_2, \text{ where $C_{I_1}^{*}$ is a constant.}
\end{align*}
Using the bound on $\eta$ and, following \cite{du2019gradient}, the fact that $\norm{\vect{y}-\vect{\hat{u}}}_2=O(\sqrt{n})$, we obtain
\begin{align*}
\norm{\vect{I'}_1(t)-\vect{I}_1(t)}_2 \le \frac{1}{16}\eta \lambda_0 \norm{\vect{y}-\vect{\hat{u}}(t)}_2.
\end{align*}
\end{proof}

\begin{lem}\label{lem:dist_u}
	\begin{align*}
	\norm{u_{i,v,l_t,t}-\hat{u}_{i,t}}_2 \le \eta \ell Q'(0,t) B, \text{ where $B$ is a constant.}
	\end{align*}
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:dist_u}]
\begin{align*}
\norm{u_{i,v,l_t,t}-\hat{u}_{i,t}}_2 &= \norm{\vect{a}_{v,l_t,t}^\top \vect{x}^{(H)}_{v,l_t,t}-\vect{a}_{t}^\top \vect{x}^{(H)}_{t}}_2 \\
& \le \eta \left(2 a_{2,0} \cdot 3 c_{x,0} \cdot \ell Q'(0,t) \left(1 + \frac{c_x}{\sqrt{m}}\right)\right).
\end{align*}
\end{proof}

\begin{lem}\label{lem:i2}
If Condition~\ref{cond:linear_converge_resist} holds for $t'=0,\ldots,t-1$ and $\eta\le c\lambda_0H^2n^{-2} \ell^{-2} S^{-1}$ for some small constant $c$, we have
$\abs{I_2^{i}(t)} \le  C_{I_2}^{*} \eta^2 \norm{y_i-\hat{u}_{i,t-1}}_2$, where $C_{I_2}^{*}$ is a constant, and thus $\norm{\vect{I}_2(t)}_2 \le \frac{1}{8}\eta \lambda_0 \norm{\vect{y}-\vect{\hat{u}}(t)}_2$.
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:i2}]
\begin{align*}
I_2^i(t) =& \int_{s=0}^{1} \langle \theta(t+1)-\theta(t), \hat{u}'_i\left(\params(t)\right) -\hat{u}'_i\left(\params(t)-s (\params(t)-\params(t+1)) \right) \rangle ds
\end{align*}
Define, for $1 \le h \le H$,
\begin{align*}
    \hat{u}'^{(h)}_i\left(\params(t)\right) = \frac{\partial \hat{u}_i(\params(t)) }{\partial \mathcal{W}_{t}^{(h)}},
\end{align*}
and
\begin{align*}
\hat{u}'^{(H+1)}_i\left(\params(t)\right) = \frac{\partial \hat{u}_i(\params(t)) }{\partial \vect{a}_t}.
\end{align*}
\begin{align*}
	\abs{I_2^i(t)} \le  \max_{0\le s\le 1} & \sum_{h=1}^{H}  \norm{\mathcal{W}_{t}^{(h)}-\mathcal{W}_{t-1}^{(h)}}_F \norm{  \hat{u}'^{(h)}_i\left(\params(t)\right) -\hat{u}'^{(h)}_i\left(\params(t)-s (\theta(t+1)-\theta(t))\right)}_F\\
	& + \norm{\vect{a}_t-\vect{a}_{t-1}}_2 \norm{\hat{u}'^{(H+1)}_i\left(\params(t)\right) -\hat{u}'^{(H+1)}_i\left(\params(t)-s (\theta(t+1)-\theta(t))\right)}_2.
	\end{align*}
From Lemma \ref{lem:dist_from_wholenetwork} and Lemma \ref{lem:dist_from_init_resnet_sub}, 
\begin{align*}
\norm{\mathcal{W}_{t}^{(h)}-\mathcal{W}_{t-1}^{(h)}}_F & \le \eta \ell \hat{Q}'(t-1) \\
\norm{\vect{a}_t-\vect{a}_{t-1}}_2 & \le \eta \ell \hat{Q}'(t-1)
\end{align*}

Let $\vect{x}_{i,t-1,s}^{(l)}$ be the activation of the global network with weights $\mathcal{W}_{t-1,s}=\mathcal{W}_{t-1}-s(\mathcal{W}_{t-1}-\mathcal{W}_{t})$. We similarly define $\mat{J}_{i,t-1,s}^{(l)}$ and $\vect{a}_{t-1,s}$. Then
\begin{align*}
& \norm{  \hat{u}'^{(h)}_i\left(\params(t)\right) -\hat{u}'^{(h)}_i\left(\params(t)-s (\mathcal{W}_{t-1}-\mathcal{W}_{t})\right)}_F \\
\le & \frac{c_{res}}{H\sqrt{m}} \big{||}\vect{x}_{i,t-1,s}^{(h-1)} \cdot
\left[\vect{a}_{t-1,s}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,t-1,s}^{(l)}\mathcal{W}_{t-1,s}^{(l)}  \right) \mat{J}_{i,t-1,s}^{(h)}  \right] \\
& -\vect{x}_{i,t-1}^{(h-1)} \cdot
\left[\vect{a}_{t-1}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,t-1}^{(l)}\mathcal{W}_{t-1}^{(l)}  \right) \mat{J}_{i,t-1}^{(h)}  \right]\big{||}_F \\
\end{align*}

By a calculation similar to that in the proof of Lemma~\ref{lem:dist_from_wholenetwork},
\begin{align*}
\norm{\mathcal{W}_{t-1,s}^{(l)}-\mathcal{W}_{t-1}^{(l)}}_F = & s\norm{(\mathcal{W}_{t-1}^{(l)}-\mathcal{W}_{t}^{(l)})}_F \\
\le &  \norm{(\mathcal{W}_{t-1}^{(l)}-\mathcal{W}_{t}^{(l)})}_F \\
\le & \eta \ell \hat{Q}'(t-1)
\end{align*}
\begin{align*}
\norm{\vect{x}_{i,t-1,s}^{(l)}-\vect{x}_{i,t-1}^{(l)}}_2 & \le \frac{c_{res}}{H\sqrt{m}} \norm{\relu{\mathcal{W}^{(l)}_{t-1,s} \vect{x}^{(l-1)}_{t-1,s}} -\relu{\mathcal{W}^{(l)}_{t-1} \vect{x}^{(l-1)}_{t-1}}}_2+\norm{\vect{x}^{(l-1)}_{t-1,s}-\vect{x}^{(l-1)}_{t-1}}_2 \\
& \le  \frac{c_{res}}{H\sqrt{m}} \norm{\relu{\mathcal{W}^{(l)}_{t-1,s} \vect{x}^{(l-1)}_{t-1,s}} -\relu{\mathcal{W}^{(l)}_{t-1,s} \vect{x}^{(l-1)}_{t-1}}}_2 \\ & +\frac{c_{res}}{H\sqrt{m}} \norm{\relu{\mathcal{W}^{(l)}_{t-1,s} \vect{x}^{(l-1)}_{t-1}} -\relu{\mathcal{W}^{(l)}_{t-1} \vect{x}^{(l-1)}_{t-1}}}_2 + \norm{\vect{x}^{(l-1)}_{t-1,s}-\vect{x}^{(l-1)}_{t-1}}_2 \\
& \le \frac{c_{res}L}{H\sqrt{m}} \norm{\mathcal{W}_{t-1,s}^{(l)}}_F\norm{\vect{x}^{(l-1)}_{t-1,s}-\vect{x}^{(l-1)}_{t-1}}_2 \\
& + \frac{c_{res}L}{H\sqrt{m}} \norm{\mathcal{W}_{t-1,s}^{(l)}-\mathcal{W}_{t-1}^{(l)}}_F \norm{\vect{x}^{(l-1)}_{t-1}}_2 + \norm{\vect{x}^{(l-1)}_{t-1,s}-\vect{x}^{(l-1)}_{t-1}}_2 \\
& \le (1 +  \frac{c_{res}L}{H\sqrt{m}} (c_{w,0}\sqrt{m}+R'\sqrt{m}+\eta \ell \hat{Q}'(t-1))) \norm{\vect{x}^{(l-1)}_{t-1,s}-\vect{x}^{(l-1)}_{t-1}}_2 \\
& + \frac{c_{res}L}{H\sqrt{m}} \eta \ell \hat{Q}'(t-1)(c_x R'+c_{x,0}) \\
\end{align*}
Also
\begin{align*}
\norm{\vect{x}_{i,t-1,s}^{(0)}-\vect{x}_{i,t-1}^{(0)}}_2 & = \frac{c_{res}}{H\sqrt{m}} \norm{\relu{\mathcal{W}^{(0)}_{t-1,s} \vect{x}_i} -\relu{\mathcal{W}^{(0)}_{t-1} \vect{x}_i}}_2 \\
& \le \frac{c_{res}}{H\sqrt{m}} L \norm{\mathcal{W}^{(0)}_{t-1,s}  -\mathcal{W}^{(0)}_{t-1} }_2 \\
& \le \frac{c_{res}}{H\sqrt{m}} L \eta \ell \hat{Q}'(t-1)
\end{align*}
Thus
\begin{align*}
\norm{\vect{x}_{i,t-1,s}^{(l)}-\vect{x}_{i,t-1}^{(l)}}_2 & \le (\frac{c_{res}}{H\sqrt{m}} L \eta \ell \hat{Q}'(t-1)+\frac{\frac{c_{res}L}{H\sqrt{m}} \eta \ell \hat{Q}'(t-1)(c_x R'+c_{x,0})}{\frac{c_{res}L}{H\sqrt{m}} (c_{w,0}\sqrt{m}+R'\sqrt{m})}) e^{\frac{c_{res}L}{H\sqrt{m}} (c_{w,0}\sqrt{m}+R'\sqrt{m}+\eta \ell \hat{Q}'(t-1))} \\
& \le \eta \ell \hat{Q}'(t-1) (\frac{c_{res}}{H\sqrt{m}} L+\frac{ (c_x R'+c_{x,0})}{ (c_{w,0}\sqrt{m}+R'\sqrt{m})})e^{\frac{c_{res}L}{\sqrt{m}} (c_{w,0}\sqrt{m}+R'\sqrt{m}+\eta \ell \hat{Q}'(t-1))} \\
& \triangleq \eta \ell \hat{Q}'(t-1) C^{*}_x
\end{align*}
Similarly, through a standard calculation, we obtain
\begin{align*}
\norm{\vect{a}_{t-1,s}-\vect{a}_{t-1}}_2 \le \eta \ell \hat{Q}'(t-1).
\end{align*}
Lastly,
\begin{align*}
\norm{\mat{J}_{i,t-1,s}^{(l)}-\mat{J}_{i,t-1}^{(l)}}_2 & = \norm{\sigma'(\mathcal{W}^{(l)}_{t-1,s} \vect{x}^{(l-1)}_{t-1,s})-\sigma'(\mathcal{W}^{(l)}_{t-1} \vect{x}^{(l-1)}_{t-1})}_2 \\
& \le \beta \norm{\mathcal{W}^{(l)}_{t-1,s} \vect{x}^{(l-1)}_{t-1,s}-\mathcal{W}^{(l)}_{t-1} \vect{x}^{(l-1)}_{t-1}}_2 \\
& \le \beta (\norm{\mathcal{W}_{t-1,s}^{(l)}}_F\norm{\vect{x}^{(l-1)}_{t-1,s}-\vect{x}^{(l-1)}_{t-1}}_2+\norm{\mathcal{W}_{t-1,s}^{(l)}-\mathcal{W}_{t-1}^{(l)}}_F \norm{\vect{x}^{(l-1)}_{t-1}}_2) \\
& \le \beta (\frac{c_{res}L}{H\sqrt{m}} (c_{w,0}\sqrt{m}+R'\sqrt{m}+\eta \ell \hat{Q}'(t-1))\eta \ell \hat{Q}'(t-1) C^{*}_x + \eta \ell \hat{Q}'(t-1)(c_x R'+c_{x,0})) \\
& = \eta \ell \hat{Q}'(t-1) \beta (\frac{c_{res}L}{H\sqrt{m}} (c_{w,0}\sqrt{m}+R'\sqrt{m}+\eta \ell \hat{Q}'(t-1)) C^{*}_x + (c_x R'+c_{x,0})) \\
& \triangleq \eta \ell \hat{Q}'(t-1) \beta  C^{*}_J
\end{align*}
Thus, according to Lemma G.1 in \cite{du2019gradient}, we have
\begin{align*}
& \norm{  \hat{u}'^{(h)}_i\left(\params(t)\right) -\hat{u}'^{(h)}_i\left(\params(t)-s (\mathcal{W}_{t-1}-\mathcal{W}_{t})\right)}_F \\
\le & \frac{c_{res}}{H\sqrt{m}} \Bigl\|\vect{x}_{i,t-1,s}^{(h-1)} \cdot
\left[\vect{a}_{t-1,s}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,t-1,s}^{(l)}\mathcal{W}_{t-1,s}^{(l)}  \right) \mat{J}_{i,t-1,s}^{(h)}  \right] \\
& -\vect{x}_{i,t-1}^{(h-1)} \cdot
\left[\vect{a}_{t-1}^\top \prod_{l=h+1}^{H}\left(\mat{I}+\frac{c_{res}}{H\sqrt{m}}\mat{J}_{i,t-1}^{(l)}\mathcal{W}_{t-1}^{(l)}  \right) \mat{J}_{i,t-1}^{(h)}  \right]\Bigr\|_F \\
\le & \eta \ell \hat{Q}'(t-1) \frac{c_{res}}{H \sqrt{m}} 2 c_{x,0} 2 a_{2,0} L e^{2Lc_{w,0}}  (\frac{C^{*}_x}{2 c_{x,0}}+\frac{1}{2a_{2,0}}+\frac{c_{res}}{\sqrt{m}}\beta C^{*}_J ).
\end{align*}
On the other hand,
\begin{align*}
    \norm{\hat{u}'^{(H+1)}_i\left(\params(t)\right) -\hat{u}'^{(H+1)}_i\left(\params(t)-s (\theta(t+1)-\theta(t))\right)}_2 & \le \norm{\vect{x}^{(H)}_{t-1,s}-\vect{x}^{(H)}_{t-1}}_2 \\
    & \le \eta \ell \hat{Q}'(t-1) C^{*}_x
\end{align*}

In the end,
\begin{align*}
\norm{\vect{I}_2(t)}_2 & \le \eta \ell \hat{Q}'(t-1) \eta \ell \hat{Q}'(t-1) (\frac{c_{res}}{\sqrt{m}} 2 c_{x,0} 2 a_{2,0} L e^{2Lc_{w,0}}  (\frac{C^{*}_x}{2 c_{x,0}}+\frac{1}{2a_{2,0}}+\frac{c_{res}}{\sqrt{m}}\beta C^{*}_J )+  C^{*}_x)\\
& \le \eta^2 \ell^2 \hat{Q}'(t-1)^2  (\frac{c_{res}}{\sqrt{m}} 2 c_{x,0} 2 a_{2,0} L e^{2Lc_{w,0}}  (\frac{C^{*}_x}{2 c_{x,0}}+\frac{1}{2a_{2,0}}+\frac{c_{res}}{\sqrt{m}}\beta C^{*}_J )+C^{*}_x)\\
& \le  \eta^2 C_{I_2}^{*} \norm{\vect{y}-\vect{\hat{u}}(t)}_2 \\
& \le \frac{1}{16}\eta \lambda_0 \norm{\vect{y}-\vect{\hat{u}}(t)}_2
\end{align*}
\end{proof}
\begin{lem}\label{lem:quadratic_resnet_whole}
If Condition~\ref{cond:linear_converge_resist} holds for $t'=0,\ldots,t-1$ and $\eta\le c\lambda_0H^2n^{-2} \ell^{-2} S^{-1}$ for some small constant $c$, we have
$\norm{\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)}_2^2 \le \frac{1}{8}\eta \lambda_0 \norm{\vect{y}-\vect{\hat{u}}(t)}_2^2$.
\end{lem}

\begin{proof}[Proof of Lemma~\ref{lem:quadratic_resnet_whole}]
	\begin{align*}
\norm{\vect{\hat{u}}(t+1)-\vect{\hat{u}}(t)}_2^2 = & \sum_{i=1}^{n}\left(\vect{a}_{t+1}^\top \vect{x}_{i,t+1}^{(H)}-\vect{a}_{t}^\top \vect{x}_{i,t}^{(H)}\right)^2 \\
= & \sum_{i=1}^{n}\left(\left[\vect{a}_{t+1}-\vect{a}_{t}\right]^\top \vect{x}_{i,t+1}^{(H)}+\vect{a}_{t}^\top \left[\vect{x}_{i,t+1}^{(H)}-\vect{x}_{i,t}^{(H)}\right] \right)^2 \\
\le &2\norm{\vect{a}_{t+1}-\vect{a}_{t}}_2^2\sum_{i=1}^{n}\norm{\vect{x}_{i,t+1}^{(H)}}_2^2+2\norm{\vect{a}_{t}}_2^2\sum_{i=1}^{n}\norm{\vect{x}_{i,t+1}^{(H)}-\vect{x}_{i,t}^{(H)}}_2^2\\
\le &18n\eta^2 \ell^2 c_{x,0}^2Q'(t)^2+4 n \left(\eta  \ell a_{2,0}c_xQ'(t)\right)^2\\
\le &\frac{1}{8}\eta \lambda_0 \norm{\vect{y}-\vect{\hat{u}}(t)}_2^2.
\end{align*}
\end{proof}
\bibliography{dun_187.bib}
\end{document}