% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% For algorithm
% \usepackage{algorithmic}
\usepackage{algorithm}
\usepackage[compatible]{algpseudocode} % or \usepackage{algcompatible}
\renewcommand{\algorithmiccomment}[1]{\hfill$\triangleright$\textit{\mdseries{#1}}}

% For theorems and such
\usepackage{amsmath}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\usepackage{bm}
\usepackage{amssymb}

\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{color}
% For fig
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{arydshln} 


\usepackage{multirow}


\makeatletter
% Dash-drawing routine that \cdashlinelr (below) temporarily installs in place
% of \adl@draw. It emits half a \tabcolsep of horizontal skip, then the dash
% leaders built from its three arguments, then another half \tabcolsep of skip.
% NOTE(review): presumably this shortens the dashed line at both ends, similar
% to booktabs' \cmidrule(lr) trimming -- confirm against the arydshln internals.
\def\adl@drawiv#1#2#3{%
        \hskip.5\tabcolsep
        \xleaders#3{#2.5\@tempdimb #1{1}#2.5\@tempdimb}%
                #2\z@ plus1fil minus1fil\relax
        \hskip.5\tabcolsep}
% \cdashlinelr{<column range>}: a \cdashline over the given columns with
% \aboverulesep of vertical space before it and \belowrulesep after it.
% While the line is drawn, \adl@draw is globally replaced by \adl@drawiv; the
% previous definition is saved in \@dashdrawstore and restored afterwards.
\newcommand{\cdashlinelr}[1]{%
  \noalign{\vskip\aboverulesep
           \global\let\@dashdrawstore\adl@draw
           \global\let\adl@draw\adl@drawiv}
  \cdashline{#1}
  \noalign{\global\let\adl@draw\@dashdrawstore
           \vskip\belowrulesep}}
\makeatother

% \usepackage{multirow}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; the optional <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Superposing Many Tickets into One: \\ A Performance Booster for Sparse Neural Network Training  \\ Supplementary Material}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Add authors
\author[1]{Lu Yin}
\author[1]{Vlado Menkovski}
\author[1]{Meng Fang}
\author[1]{Tianjin Huang}
\author[1]{Yulong Pei}
\author[1]{Mykola Pechenizkiy} 
\author[2,1]{\\Decebal Constantin Mocanu}
\author[1]{Shiwei Liu}
% Add affiliations after the authors
\affil[1]{%
    Eindhoven University of Technology \\
    Eindhoven, the Netherlands
}
\affil[2]{%
    University of Twente\\
    Enschede, the Netherlands
}
% \author[1]{\href{mailto:<l.yin@tue.nl>?Subject=Your UAI 2022 paper}{Lu Yin}{}}
% \author[1]{\href{mailto:<l.yin@tue.nl>?Subject=Your UAI 2022 paper}{Vlado Menkovski}{}}
% \author[1]{\href{mailto:<l.yin@tue.nl>?Subject=Your UAI 2022 paper}{Meng Fang}{}}
% \author[1]{\href{mailto:<l.yin@tue.nl>?Subject=Your UAI 2022 paper}{Tianjin Huang}{}}
% \author[1]{\href{mailto:<l.yin@tue.nl>?Subject=Your UAI 2022 paper}{Yulong Pei}{}}
% \author[1]{\href{mailto:<l.yin@tue.nl>?Subject=Your UAI 2022 paper}{Mykola Pechenizkiy}{}}\\
% \author[1,2]{\href{mailto:<l.yin@tue.nl>?Subject=Your UAI 2022 paper}{Decebal Constantin Mocanu}{}}
% \author[1]{\href{mailto:<l.yin@tue.nl>?Subject=Your UAI 2022 paper}{Shiwei Liu}{}}
% Add affiliations after the authors

  \begin{document}
 
\onecolumn
\maketitle




\appendix



% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi}


\section{Experimental Results of Wide ResNet28-10 on CIFAR-10/100}
\label{sec:WRN2810}


\begin{table*}[h]
\centering
\caption{Test accuracy (\%) of sparse Wide ResNet28-10 on CIFAR-10/100. All the results are averaged from three random runs. In each setting, the best results are marked in bold.}
\label{table:WRN_CIFAR}
\resizebox{.9\textwidth}{!}{
\begin{tabular}{lccc ccc}
\cmidrule[\heavyrulewidth](lr){1-7}

 \textbf{Dataset}     & \multicolumn{3}{c}{CIFAR-10} & \multicolumn{3}{c}{CIFAR-100}  \\ 
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
\textbf{Wide ResNet28-10 }~(Dense) 
& 96.00$\pm$0.13  & - & - 
& 81.09$\pm$0.19  & - & - 
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
Sparsity     & 95\%      & 90\%     & 80\%     
     &  95\%      & 90\%     & 80\%         \\ 
     
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}


SET~\citep{mocanu2018scalable}
& \textbf{95.63$\pm$0.08}  & 95.85$\pm$0.02   & 95.92$\pm$0.25  
& 79.36$\pm$0.14  & 80.44$\pm$0.18   & 80.60$\pm$0.07 
\\
SET+Sup-tickets (ours)
& 95.53$\pm$0.11  & \textbf{95.91$\pm$0.14}  &\textbf{95.93$\pm$0.10}
& \textbf{79.66$\pm$0.18}  & \textbf{80.65$\pm$0.04}   & \textbf{80.91$\pm$0.20}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

RigL~\citep{evci2020rigging}
& 95.70$\pm$0.07 & 95.96$\pm$0.12  &  {96.12$\pm$0.05}
& 79.41$\pm$0.24 & 80.45$\pm$0.45 & 80.92$\pm$0.20
\\
RigL+Sup-tickets (ours)
&\textbf{95.90$\pm$0.11} & \textbf{95.98$\pm$0.06}  &  {\textbf{96.15$\pm$0.08}}
&\textbf{80.00$\pm$0.15}  & \textbf{80.72$\pm$0.22} &  {\textbf{81.16$\pm$0.09}}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

GraNet~\citep{liu2021neuroregeneration}  
& 95.95$\pm$0.08 &  {96.02$\pm$0.01} &  {\textbf{96.09$\pm$0.07}}
&  80.43$\pm$0.17 & 80.97$\pm$0.16 &  {81.31$\pm$0.09}
\\
GraNet+Sup-tickets (ours)
& {\textbf{96.03$\pm$0.11}} &  {\textbf{96.13$\pm$0.07}} &  {96.08$\pm$0.04}
&\textbf{80.65$\pm$0.06} &  {\textbf{81.20$\pm$0.09}} &  {\textbf{81.42$\pm$0.18}}
\\
\cmidrule[\heavyrulewidth](lr){1-7}
\end{tabular}}
\end{table*}



\section{Impact of the Cheap Tickets without Training Time Constraint}
\label{sec:ticket_count}

{We extend the overall training time to yield 9 tickets. All the cheap tickets are trained for 8 epochs. The results on CIFAR-100 are reported in Table~\ref{table:ticket_count}. All results are averaged from 3 random runs. As shown, the performance of Sup-tickets continuously improves as the number of tickets increases. }
 
 
\begin{table}[htbp]
\centering
\caption{{Test accuracy (\%) on CIFAR-100 of Sup-tickets combined with RigL under different cheap ticket counts. The best results are marked in bold. }}


\label{table:ticket_count}
\resizebox{0.35\textwidth}{!}{
\begin{tabular}{lccc}
\cmidrule[\heavyrulewidth](lr){1-4}

{\textbf{Ticket}}  & \multicolumn{3}{c}{Sparsity } \\   
\cmidrule(lr){2-4}

{\textbf{count}} & 95\%      & 90\%     & 80\%         
 \\ 
\cmidrule[\heavyrulewidth](lr){1-4}

\multicolumn{4}{c}{VGG-16} 
\\
\cmidrule[\heavyrulewidth](lr){1-4}
N=3
 &  71.47$\pm$0.29 &  72.86$\pm$0.22 &  73.42$\pm$0.21
\\
N=6
 &  71.79$\pm$0.10 &  73.19$\pm$0.23 &  73.69$\pm$0.36
\\
N=9
 &  \textbf{71.92$\pm$0.07} &  \textbf{73.30$\pm$0.26} &  \textbf{74.00$\pm$0.38}
\\
\cmidrule[\heavyrulewidth](lr){1-4}
\multicolumn{4}{c}{ResNet-50} 
\\
\cmidrule[\heavyrulewidth](lr){1-4}
N=3
 &  77.14$\pm$0.57 &  77.84$\pm$0.21 &  78.08$\pm$0.40
\\
N=6
 &  77.53$\pm$0.55 &  78.12$\pm$0.32 &  78.18$\pm$0.49
\\
N=9
 & \textbf{77.57$\pm$0.55} &  \textbf{78.15$\pm$0.20} &  \textbf{78.19$\pm$0.46}
\\
\cmidrule[\heavyrulewidth](lr){1-4}
\end{tabular}
}

\end{table}


 \newpage
\section{The Variance of the Multiple Cheap Tickets}


The variance of the cheap tickets obtained by our method is quite low, as shown in the following table. To ensure good final performance, we expect all the subnetworks to be located in the same low-loss basin with similar performance. On the other hand, high variance means that cheap tickets are located in different basins, and weight averaging will not bring performance gains. To verify this hypothesis, we generate 3 cheap subnetworks under 95\% sparsity on CIFAR-100 with high variance by using different prune/grow criteria: prune with high magnitude and grow with high gradient, prune with low magnitude and grow randomly, prune with low magnitude and grow randomly. The results are reported in Table~\ref{table:dif variance}.


{We find that averaging subnetworks with high variance significantly hurts the performance, likely because they are not from the same loss basin. }





\begin{table}[!ht]
\centering
\caption{{ Accuracy (\%)  of each ticket and the averaged ticket under different variance.  }}
\label{table:dif variance}

\resizebox{0.7\textwidth}{!}{
\begin{tabular}{lcccc}
\toprule

Model & Setting &  {Accuracy of Each Ticket}  &   Std.\ Dev.   & Averaged Accuracy \\
\midrule

ResNet-50  & High Variance                    & [69.44, 76.52, 61.50]  & 6.13 &  2.04 \\ 
          & Low Variance (ours)              & [77.15, 77.56, 77.07] & 0.21 & 77.87\\ 
\midrule
VGG-16  & High Variance                    &[64.02, 70.01, 58.76] & 4.60 &  2.14 \\ 
          & Low Variance (ours)              & [70.31 ,70.15, 70.32] & 0.08 & 71.19\\
\bottomrule
\end{tabular} }
\end{table}

\section{Comparison with Large Learning Rate Schedule}
\label{sec:large lr}



We perform experiments in which the learning rate immediately increases to a very large value of 0.1 at the beginning of each cycle. We expect that the large learning rate will force the cheap tickets to jump out of the current basin, so that weight averaging does not bring any performance gains. The results in Table~\ref{table:large_lr} are in line with our expectations: parameter averaging degrades the accuracy to roughly 10\%--30\%, even though the accuracy of each subnetwork is still high. Besides, if we generate tickets with different prune/grow criteria, they are also likely to be located in different basins, and averaging them dramatically hurts the performance.


\begin{table}[!ht]
\centering
\caption{{{Results of Sup-tickets under a large restarting learning rate (0.1) and a small restarting learning rate (0.005).}}}

\label{table:large_lr}
\resizebox{0.5\textwidth}{!}{
\begin{tabular}{lccc}
\toprule

Model & Setting &  {Accuracy of Each Ticket}     & Averaged Accuracy \\
\midrule

ResNet-50  & Large LR schedule                    &  [76.05, 76.37, 76.97]   &  10.37 \\ 
          & Low LR schedule (ours)              & [77.15, 77.56, 77.07] & 77.87\\ 
\midrule
VGG-16  & Large LR schedule                      &[68.68, 69.29, 70.36] &  31.74 \\ 
          & Low LR schedule (ours)               & [70.31 ,70.15, 70.32] & 71.19\\
\bottomrule
\end{tabular} }

\end{table}


\newpage
\section{Implementation Details  of Sup-Tickets}
\label{sec:implementation}
In this appendix, we report the implementation details for Sup-tickets, including: total training epochs (T-epochs), epochs of normal sparse training (N-epochs), epochs of cheap tickets generation (C-epochs), length of each cyclical learning rate cycle (C), learning rate (LR), batch size (BS), learning rate drop (LR Drop), the lowest learning rate of cyclical learning rate schedule (LR-$\alpha_2$), the largest learning rate of cyclical learning rate schedule (LR-$\alpha_1$), weight decay (WD), produced tickets count (Ticket Count), SGD momentum (Momentum), sparse initialization (Sparse Init), etc.



\subsection{Implementation Details for CIFAR-10/100}

 
 
\begin{table*}[!ht]
\centering
\caption{Implementation hyperparameters of Sup-tickets on CIFAR-10/100}
\label{tab:hypo_hyper_cifar}
\resizebox{1.0\textwidth}{!}{
\begin{tabular}{ccccccccccccccc}
\toprule
Model & T-epochs & N-epochs & C-epochs &C&  BS & LR  & LR Drop, Epochs & LR-$\alpha_2$ & LR-$\alpha_1$ & Ticket Count & Optimizer & Momentum & WD & Sparse Init \\ 
\midrule
VGG-16 &  250 & 226 & 24 & 8 & 128  & 0.1 & 10x, [113, 169]  & 0.001 & 0.005   & 3  & SGD &0.9 &5e-4 & ERK  \\
ResNet-50 &  250 & 226 & 24 & 8 &128  & 0.1 & 10x, [113, 169]  & 0.001 & 0.005   & 3  & SGD &0.9 &5e-4 & ERK  \\
Wide ResNet28-10 &  250 & 226 & 24  & 8 & 128  & 0.1 & 10x, [113, 169]  & 0.001 & 0.005   & 3  & SGD &0.9 &5e-4 & ERK  \\
\bottomrule
\end{tabular}}
\end{table*}


\subsection{Implementation Details for ImageNet}

\begin{table*}[!ht]
\centering
\caption{Implementation hyperparameters of Sup-tickets on ImageNet}
\label{tab:hypo_hyper_imgnet}
\resizebox{1.0\textwidth}{!}{
\begin{tabular}{ccccccccccccccc}
\toprule
Model & T-epochs & N-epochs & C-epochs &C & BS & LR  & LR Drop, Epochs & LR-$\alpha_2$ & LR-$\alpha_1$ & Ticket Count & Optimizer & Momentum & WD & Sparse Init \\ 
\midrule
ResNet-50 &  100 & 92 & 8 &2 & 64  & 0.1 & 10x, [30, 60, 85]  & 0.0001 & 0.0005   & 4  & SGD &0.9 &1e-4 & ERK  \\
\bottomrule
\end{tabular}}
\end{table*}









\newpage
\section{Comparison between Different Batch Normalization Updating Strategies}
\label{sec:bn}
In this section, we compare the test accuracy of two batch normalization updating strategies: (1) using an additional running pass over the training data; (2) retrieving the statistics by averaging across the cheap tickets (ours). From Table~\ref{table:imagenet batch normalization imagenet} and Table~\ref{table:batch normalization cifar}, we find that there is no obvious difference in test accuracy between these two methods. However, our method saves computation because it does not require the additional running pass.



\begin{table}[htbp]
\centering
\caption{Test accuracy (\%) of different batch normalization updating strategies for ResNet-50 on ImageNet. BU stands for batch normalization updating using an additional running pass over the data. AV means averaging across each cheap ticket (ours). In each setting, the best results are marked in bold.}
\label{table:imagenet batch normalization imagenet}
\resizebox{0.35\textwidth}{!}{
\begin{tabular}{lcc}
\cmidrule[\heavyrulewidth](lr){1-3}
\textbf{Dataset}     & \multicolumn{2}{c}{ImageNet}  \\ 
\cmidrule[\heavyrulewidth](lr){1-3}
Sparsity      & 90\%     & 80\%        \\ 
\cmidrule(lr){1-1}
\cmidrule(lr){2-3}
RigL+Sup-tickets (AV)
&74.044 & \textbf{75.966}
\\
RigL+Sup-tickets (BU)
&\textbf{74.083}	  &75.925	
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-3}
GraNet+Sup-tickets (AV)
&74.554  & \textbf{76.168} 
\\
GraNet+Sup-tickets (BU)
&\textbf{74.560}	  & 76.109
\\
\cmidrule[\heavyrulewidth](lr){1-3} 
\end{tabular} }
\end{table}



% \begin{table*}[htbp]
% \centering
% \caption{Comparison between test accuracy with and without batch normalization updates on ImageNet (include last model)}
% \label{table:imagenet batch normalization imagenet}
% \resizebox{0.4\textwidth}{!}{
% \begin{tabular}{lcc}
% \cmidrule[\heavyrulewidth](lr){1-3}
% \textbf{Dataset}     & \multicolumn{2}{c}{ImageNet}  \\ 
% \cmidrule[\heavyrulewidth](lr){1-3}
% Sparsity      & 90\%     & 80\%        \\ 
% \cmidrule(lr){1-1}
% \cmidrule(lr){2-3}
% RigL+Sup-tickets
% &74.003 & 75.968	
% \\
% RigL+Sup-tickets (BU)
% &\textbf{74.059}	  &\textbf{75.962}		
% \\
% \cmidrule(lr){1-1}
% \cmidrule(lr){2-3}
% GraNet+Sup-tickets
% & \textbf{74.554}  & 76.082	
% \\
% GraNet+Sup-tickets (BU)
% &74.538	 &  \textbf{76.074}
% \\
% \cmidrule[\heavyrulewidth](lr){1-3} 
% \end{tabular} }
% \end{table*}

\begin{table*}[htbp]
\centering
\caption{Test accuracy (\%) of different batch normalization updating strategies on CIFAR-10/100.  BU stands for batch normalization updating using additional running pass over the data. AV means averaging across each cheap ticket (ours). In each setting, the best results are marked in bold.}
\label{table:batch normalization cifar}
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{lccc ccc}
\cmidrule[\heavyrulewidth](lr){1-7}

 \textbf{Dataset}     & \multicolumn{3}{c}{CIFAR-10} & \multicolumn{3}{c}{CIFAR-100}  \\ 
 \cmidrule(lr){1-7}


Sparsity     & 95\%      & 90\%     & 80\%     
     &  95\%      & 90\%     & 80\%         \\ 
     
 \cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

\textbf{VGG-16}~(Dense) 
& 93.91$\pm$0.26   & - & - 
& 73.61$\pm$0.45  & - & - 
\\
SET+Sup-tickets (AV)
&{93.22$\pm$0.09}  & \textbf{93.63$\pm$0.05} & {93.80$\pm$0.13}
&{71.18$\pm$0.29}  & \textbf{71.99$\pm$0.27} & {73.02$\pm$0.32}
\\
SET+Sup-tickets (BU)
&{93.22$\pm$0.12}  & {93.62$\pm$0.01} & {93.80$\pm$0.01}
&\textbf{71.30$\pm$0.26}  & {71.96$\pm$0.19} & \textbf{73.04$\pm$0.31}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

RigL+Sup-tickets (AV)
&{93.20$\pm$0.13}  &  {93.81$\pm$0.11}  & {93.85$\pm$0.25}
&{71.31$\pm$0.21} & {72.57$\pm$0.29} & {73.61$\pm$0.11}
\\
RigL+Sup-tickets (BU)
&\textbf{93.24$\pm$0.11}  &  \textbf{93.86$\pm$0.15}  & \textbf{93.88$\pm$0.28}
&\textbf{71.36$\pm$0.16} & \textbf{72.60$\pm$0.27} & \textbf{73.68$\pm$0.16}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
GraNet+Sup-tickets (AV)
&{94.10$\pm$0.06} & \textbf{94.13$\pm$0.12} & {94.24$\pm$0.05}
& { 73.61$\pm$0.24}& \textbf{73.87$\pm$0.26} & {73.95$\pm$0.30}
\\
GraNet+Sup-tickets (BU)
&\textbf{94.14$\pm$0.06} & {94.10$\pm$0.14} & \textbf{94.25$\pm$0.07}
& \textbf{ 73.71$\pm$0.21}& {73.79$\pm$0.21} & \textbf{74.03$\pm$0.27}
\\
\cmidrule[\heavyrulewidth](lr){1-7}
\textbf{Wide ResNet28-10 }~(Dense) 
& 96.00$\pm$0.13  & - & - 
& 81.09$\pm$0.19  & - & - 
\\
SET+Sup-tickets (AV)
& 95.53$\pm$0.11  & {95.91$\pm$0.14}  &{95.92$\pm$0.10}
& \textbf{79.66$\pm$0.18}  & \textbf{80.65$\pm$0.04}   & \textbf{80.91$\pm$0.20}
\\
SET+Sup-tickets (BU)
&  \textbf{95.59$\pm$0.11}  & \textbf{95.98$\pm$0.08}  &\textbf{95.97$\pm$0.06}
&79.36$\pm$0.35  & {80.47$\pm$0.05}   & {80.74$\pm$0.21}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

RigL+Sup-tickets (AV)
&\textbf{95.90$\pm$0.11} & \textbf{95.98$\pm$0.06}  & {96.15$\pm$0.08}
&\textbf{80.00$\pm$0.15}  & \textbf{80.72$\pm$0.22} & \textbf{81.16$\pm$0.09}
\\
RigL+Sup-tickets (BU)
&{95.88$\pm$0.10} & {95.97$\pm$0.04}  & \textbf{96.17$\pm$0.11}
&{79.76$\pm$0.23}  & {80.52$\pm$0.20} & {81.13$\pm$0.15}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

GraNet+Sup-tickets (AV)
&\textbf{96.03$\pm$0.11} & 96.13$\pm$0.07 & 96.08$\pm$0.04
&{80.65$\pm$0.06} & \textbf{81.20$\pm$0.09} & \textbf{81.42$\pm$0.18}
\\

GraNet+Sup-tickets (BU)
&{96.01$\pm$0.07} & \textbf{96.19$\pm$0.08} & \textbf{96.14$\pm$0.09}
&\textbf{80.73$\pm$0.04} & {81.17$\pm$0.13} & {81.39$\pm$0.21}
\\

\cmidrule[\heavyrulewidth](lr){1-7}

\textbf{ ResNet-50 }~(Dense) 
& 94.88$\pm$0.11 & - & - 
& 78.00$\pm$0.40  & - & - 
\\
\cmidrule[\heavyrulewidth](lr){1-7}

SNIP+Sup-tickets (AV)
& {94.33$\pm$0.09} & {95.05$\pm$0.22} & {95.21$\pm$0.09}
& \textbf{65.56$\pm$1.15} & {76.34$\pm$0.27} & \textbf{77.43$\pm$0.53}
\\
SNIP+Sup-tickets (BU)
& \textbf{94.39$\pm$0.06} & \textbf{95.10$\pm$0.12} & \textbf{95.30$\pm$0.02}
& {65.51$\pm$0.83} & \textbf{76.62$\pm$0.23} & {77.35$\pm$0.62}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
ERK+Sup-tickets (AV)
& {93.92$\pm$0.04} & {94.80$\pm$0.06} & {95.11$\pm$0.27}
& {75.75$\pm$0.28} & {76.82$\pm$0.08} & \textbf{77.85$\pm$0.42}
\\
ERK+Sup-tickets (BU)
& \textbf{93.99$\pm$0.08} & \textbf{94.87$\pm$0.04} & \textbf{95.18$\pm$0.27}
& \textbf{76.02$\pm$0.22} & \textbf{77.01$\pm$0.17} & {77.80$\pm$0.54}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}


SET+Sup-tickets (AV)
&{94.81$\pm$0.05}   & {94.87$\pm$0.03}  & \textbf{94.90$\pm$0.27}
&\textbf{ 76.68$\pm$0.38}  & {77.89$\pm$0.45}   & 78.35$\pm$0.18
\\
SET+Sup-tickets (BU)
&\textbf{94.85$\pm$0.03}   & \textbf{94.97$\pm$0.05}  & {94.86$\pm$0.20}
&{ 76.54$\pm$0.41}  & \textbf{77.93$\pm$0.50}   & \textbf{78.38$\pm$0.18}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

RigL+Sup-tickets (AV)
& \textbf{94.65$\pm$0.11}  & {94.82$\pm$0.13} &  \textbf{94.81$\pm$0.15}
&\textbf{77.58$\pm$0.47}  &  \textbf{78.52$\pm$0.39}  &   \textbf{78.69$\pm$0.30}
\\
RigL+Sup-tickets (BU)
&{94.64$\pm$0.13}  &  \textbf{94.89$\pm$0.09}  &   {94.79$\pm$0.17}
&{77.54$\pm$0.53}  &  {78.43$\pm$0.40}  &   {78.53$\pm$0.31}
\\

\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}


GraNet+Sup-tickets (AV)
&{94.89$\pm$0.15} & {95.08$\pm$0.08} & {94.94$\pm$0.03}
&{77.70$\pm$0.47} & {78.37$\pm$0.53}  & \textbf{78.95$\pm$0.33}


\\
GraNet+Sup-tickets (BU)
&\textbf{94.91$\pm$0.19} & \textbf{95.16$\pm$0.14} & \textbf{95.09$\pm$0.03}
&\textbf{77.82$\pm$0.60} & \textbf{78.63$\pm$0.64}  & {78.07$\pm$0.32}

\\

\cmidrule[\heavyrulewidth](lr){1-7}
%\bottomrule
\end{tabular}
}



\end{table*}
 




 \newpage
\section{Layer-wise Sparsity of ResNet-50 on ImageNet}

Table~\ref{tab:res50sparsity} summarizes the final sparsity budgets for 90\% sparse ResNet-50 on ImageNet-1K obtained by various methods. Backbone represents the sparsity budgets for all the CNN layers without the last fully-connected layer.

\begin{table}[!ht]
\centering
\caption{ResNet-50 Learnt Budgets and Backbone Sparsities at Sparsity 90\%
}
\label{tab:res50sparsity}
\resizebox{0.8\columnwidth}{!}{
\begin{tabular}{@{}l|rr|cccc@{}}
\toprule
\multirow{2}{*}{Metric}          & \multicolumn{1}{c}{\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Fully Dense \\ Params\end{tabular}}} & \multicolumn{1}{c|}{\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Fully Dense \\ FLOPs\end{tabular}}} & \multicolumn{4}{c}{Sparsity (\%)}                     \\ \cmidrule(l){4-7} 
                                 & \multicolumn{1}{c}{}                                                                               & \multicolumn{1}{c|}{}                                                                              &  GraNet+Sup-tickets & GraNet  & RigL+Sup-tickets &RigL   \\ \midrule
Overall & 25502912 & 8178569216 & 89.99 & 89.98 & 90.23   & 90.00  \\
Backbone & 23454912 & 8174272512 & 89.89 & 90.65 & 92.47 
& 90.00 
\\ \midrule
Layer 1 - conv1 & 9408 & 118013952 & 37.40 & 38.22 & 57.26 & 58.32
\\
Layer 2 - layer1.0.conv1 & 4096 & 236027904 & 40.55 & 41.70 & 14.58 & 9.40
\\
Layer 3 - layer1.0.conv2 & 36864 & 231211008 & 64.88 & 65.05 & 82.13 & 82.40
\\
Layer 4 - layer1.0.conv3 & 16384 & 102760448 & 64.69 & 65.09 & 17.13 & 16.41
\\
Layer 5 - layer1.0.downsample.0 & 16384 & 102760448 & 74.75 & 74.99 & 29.10 & 24.25
\\
Layer 6 - layer1.1.conv1 & 16384 & 102760448 & 66.33 & 66.75 & 19.72 & 19.02
\\
Layer 7 - layer1.1.conv2 & 36864 & 231211008 & 62.25 & 62.62 & 82.05 & 82.44
\\
Layer 8 - layer1.1.conv3 & 16384 & 102760448 & 57.99 & 58.57 & 4.79 & 4.07
\\
Layer 9 - layer1.2.conv1 & 16384  & 102760448  & 60.15 & 60.60 & 4.85 & 4.19
\\
Layer 10 - layer1.2.conv2 & 36864 & 231211008 & 57.15 & 57.45 & 81.73 & 82.06
\\
Layer 11 - layer1.2.conv3 & 16384 & 102760448  & 57.10 & 57.47 & 5.13 & 3.88   
\\
Layer 12 - layer2.0.conv1 & 32768 & 205520896 & 49.90 & 50.42 & 41.61 & 42.37
\\
Layer 13 - layer2.0.conv2 & 147456  & 231211008 & 69.44 & 69.49 & 91.09 & 91.25 
\\
Layer 14 - layer2.0.conv3 & 65536 & 102760448 & 60.42 & 60.74 & 51.43 & 51.98
\\
Layer 15 - layer2.0.downsample.0 & 131072 & 205520896 & 87.23 & 87.26 & 71.36 & 71.27
\\
Layer 16 - layer2.1.conv1 & 65536  & 102760448  & 84.79 & 84.91 & 52.47 & 52.40  
\\
Layer 17 - layer2.1.conv2  & 147456 & 231211008  & 83.03 & 83.07 & 91.25 & 91.34
\\
Layer 18 - layer2.1.conv3  & 65536 & 102760448 & 70.03 & 70.25 & 52.06 & 52.43 
\\
Layer 19 - layer2.2.conv1 & 65536& 102760448 & 79.47 & 79.61 & 52.07 & 52.25
\\
Layer 20 - layer2.2.conv2  & 147456 & 231211008 & 81.78 & 81.82 & 91.28 & 91.38
\\
Layer 21 - layer2.2.conv3  & 65536 & 102760448 & 73.76 & 73.92 & 51.76 & 51.95
\\
Layer 22 - layer2.3.conv1 & 65536 & 102760448 & 74.82 & 74.97 & 51.92 & 52.24 
\\
Layer 23 - layer2.3.conv2 & 147456 & 231211008 & 82.78 & 82.81 & 91.22 & 91.33 
\\
Layer 24 - layer2.3.conv3 & 65536 & 102760448  & 76.61 & 76.73 & 51.86 & 52.01 
\\
Layer 25 - layer3.0.conv1 & 131072 & 205520896 & 60.53 & 60.81 & 70.98 & 71.39
\\
Layer 26 - layer3.0.conv2 & 589824 & 231211008 & 83.45 & 83.41 & 95.66 & 95.72  
\\
Layer 27 - layer3.0.conv3 & 262144 & 102760448 & 69.56 & 69.73 & 75.77 & 76.06
\\
Layer 28 - layer3.0.downsample.0 & 524288 & 205520896 & 95.24 & 95.21 & 85.79 & 85.64
\\
Layer 29 - layer3.1.conv1 & 262144 & 102760448 & 91.19 & 91.22 & 76.02 & 76.03
\\
Layer 30 - layer3.1.conv2 & 589824 & 231211008 & 92.86 & 92.87 & 95.68 & 95.73
\\
Layer 31 - layer3.1.conv3 & 262144 & 102760448  & 80.70 & 80.81 & 75.76 & 75.95  
\\
Layer 32 - layer3.2.conv1 & 262144 & 102760448  & 90.34 & 90.40 & 76.09 & 76.18  
\\
Layer 33 - layer3.2.conv2 & 589824 & 231211008 & 93.22 & 93.24 & 95.68 & 95.73 
\\
Layer 34 - layer3.2.conv3 & 262144 & 102760448 & 83.42 & 83.47 & 76.06 & 76.21  
\\
Layer 35 - layer3.3.conv1 & 262144 & 102760448 & 89.12 & 89.17 & 76.14 & 76.23
\\
Layer 36 - layer3.3.conv2 & 589824 & 231211008 & 93.20 & 93.21 & 95.67 & 95.71
\\
Layer 37 - layer3.3.conv3 & 262144 & 102760448 & 86.26 & 86.30 & 76.13 & 76.24
\\
Layer 38 - layer3.4.conv1 & 262144 & 102760448  & 88.64 & 88.70 & 75.85 & 75.97
\\
Layer 39 - layer3.4.conv2 & 589824 & 231211008 & 94.50 & 94.51 & 95.65 & 95.69 
\\
Layer 40 - layer3.4.conv3 & 262144 & 102760448 & 87.05 & 87.09 & 75.94 & 76.05
\\
Layer 41 - layer3.5.conv1 & 262144 & 102760448 & 87.10 & 87.15 & 75.91 & 76.07
\\
Layer 42 - layer3.5.conv2 & 589824 & 231211008 & 95.13 & 95.14 & 95.69 & 95.72
\\
Layer 43 - layer3.5.conv3 & 262144 & 102760448 & 88.91 & 88.95 & 76.06 & 76.14 
\\
Layer 44 - layer4.0.conv1 & 524288 & 205520896  & 72.04 & 72.13 & 85.54 & 85.67 
\\
Layer 45 - layer4.0.conv2 & 2359296 & 231211008 & 93.56 & 93.53 & 97.84 & 97.86
\\
Layer 46 - layer4.0.conv3 & 1048576 & 51380224 & 82.00 & 82.01 & 88.01 & 88.09 
\\
Layer 47 - layer4.0.downsample.0 & 2097152 & 205520896 & 99.25 & 99.24 & 92.96 & 92.84 
\\
Layer 48 - layer4.1.conv1 & 1048576 & 102760448 & 95.73 & 95.74 & 88.02 & 88.07
\\
Layer 49 - layer4.1.conv2 & 2359296 & 231211008  & 97.39 & 97.39 & 97.86 & 97.87
\\
Layer 50 - layer4.1.conv3 & 1048576 & 102760448 & 91.08 & 91.07 & 88.10 & 88.12
\\
Layer 51 - layer4.2.conv1 & 1048576 & 205520896 & 87.68 & 87.70 & 87.99 & 88.04  
\\
Layer 52 - layer4.2.conv2 & 2359296 & 231211008  & 97.02 & 97.01 & 97.86 & 97.86
\\
Layer 53 - layer4.2.conv3 & 1048576 & 102760448 & 84.54 & 84.50 & 88.07 & 88.07
\\
Layer 54 - fc & 2048000 & 4096000 & 82.70 & 82.54 & 92.78 & 92.74
\\ 
\bottomrule
\end{tabular}}
\end{table}




\newpage
\section{Comparison with Output Ensemble and Knowledge Distillation}
\label{sec:deep ensemble}
This appendix compares our approach with the prediction ensemble (averaging prediction of subnetworks). For deep ensemble, we use the same procedure to generate cheap tickets as in Sup-tickets; but instead of averaging their weights and connection topology, we save all the cheap tickets in memory and average their softmax outputs at inference stage~\citep{huang2017snapshot,garipov2018loss}. 

The results are reported in Table~\ref{table:test_ensemble_cifar} and Table~\ref{table:test_ensemble_imagnetnet}. Across extensive settings, we observe that Sup-tickets closely matches the strong baseline of output averaging. It is worth noting that, compared with the latter, our method requires neither multiple forward passes for prediction nor storing all the ensemble members.



\begin{table*}[htbp]
\centering
\caption{{\textbf{Comparison with prediction ensemble.} Test accuracy (\%) of Sup-tickets and naive deep ensemble on CIFAR10/100.}}
\label{table:test_ensemble_cifar}
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{lccc ccc}
\cmidrule[\heavyrulewidth](lr){1-7}

 \textbf{Dataset}     & \multicolumn{3}{c}{CIFAR-10} & \multicolumn{3}{c}{CIFAR-100}  \\ 
 \cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
 


Sparsity     & 95\%      & 90\%     & 80\%     
     &  95\%      & 90\%     & 80\%         \\ 

\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
VGG-16
\\ 
\cmidrule(lr){1-7}
RigL + Prediction Ensemble
& \textbf{93.25$\pm$0.18} &  \textbf{93.82$\pm$0.09} & \textbf{93.97$\pm$0.18}
& \textbf{71.80$\pm$0.24} & \textbf{73.07$\pm$0.34} & \textbf{73.80$\pm$0.21}
\\
RigL + Sup-tickets (ours)
&{93.20$\pm$0.13}  &  {93.81$\pm$0.11}  & {93.85$\pm$0.25}
&{71.31$\pm$0.21} & {72.57$\pm$0.29} &  {{73.61$\pm$0.11}}
\\
\cmidrule(lr){1-7}
ResNet-50
\\ 
\cmidrule(lr){1-7}
RigL + Prediction Ensemble
 &94.64$\pm$0.12 & \textbf{94.94$\pm$0.06} & \textbf{94.86$\pm$0.25 }
 &\textbf{77.66$\pm$0.4} & \textbf{78.54$\pm$0.41} & 78.67$\pm$0.25
\\
RigL + Sup-tickets (ours)
& \textbf{94.65$\pm$0.11}  & {94.82$\pm$0.13} &  {94.81$\pm$0.15}
&{77.58$\pm$0.47}  &  {{78.52$\pm$0.39}}  &    \textbf{78.69$\pm$0.30}
\\
\cmidrule[\heavyrulewidth](lr){1-7}




\end{tabular}}
\end{table*}



\begin{table}[htbp]
\centering
\caption{Test accuracy (\%) of Sup-tickets and naive deep ensemble for  ResNet-50 on ImageNet. In each setting, the best results are marked in bold.}
\label{table:test_ensemble_imagnetnet}
\resizebox{0.35\textwidth}{!}{
\begin{tabular}{lcc}
\cmidrule[\heavyrulewidth](lr){1-3}
\textbf{Dataset}     & \multicolumn{2}{c}{ImageNet}  \\ 
\cmidrule[\heavyrulewidth](lr){1-3}
Sparsity      & 90\%     & 80\%        \\ 
\cmidrule(lr){1-1}
\cmidrule(lr){2-3}
RigL+Sup-tickets(Ours)
&74.044 & 75.966
\\
RigL+Ensemble
&\textbf{74.074}	  &\textbf{76.022}	
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-3}
GraNet+Sup-tickets(Ours)
&74.554  & 76.168
\\
GraNet+Ensemble
&\textbf{74.614}	  & \textbf{76.198}
\\
\cmidrule[\heavyrulewidth](lr){1-3} 
\end{tabular} }
\end{table}







{Besides, we also apply knowledge distillation~\citep{hinton2015distilling} to distill the knowledge of three sup-tickets into a sparse student model. Each soft loss from the teacher model and the hard loss from the real label have equal weight in the final loss. Compared with knowledge distillation, we do not need to save all the sub-models as teacher models and do not need an extra round of training. Below we report the test accuracy of sparse VGG-16 on CIFAR-10/100. All the results are averaged from 3 random runs. Our method achieves higher accuracy (11 out of 12 cases) than the knowledge distillation based method.}



\begin{table*}[htbp]
\centering
\caption{{\textbf{Comparison with knowledge distillation.} Test accuracy (\%) of Sup-tickets and knowledge distillation (KD). In each setting, the best results are marked in bold.}}
\label{table:kd}
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{lccc ccc}
\cmidrule[\heavyrulewidth](lr){1-7}

 \textbf{Dataset}     & \multicolumn{3}{c}{CIFAR-10} & \multicolumn{3}{c}{CIFAR-100}  \\ 
 \cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
 


Sparsity     & 95\%      & 90\%     & 80\%     
     &  95\%      & 90\%     & 80\%         \\ 
     
 \cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

SET+KD
& 93.13$\pm$0.06 & 93.56$\pm$0.16 & 93.53$\pm$0.10
& 70.73$\pm$0.18 & 71.79$\pm$0.42 & \textbf{73.06$\pm$0.02}
\\
SET+Sup-tickets (ours)
&\textbf{93.22$\pm$0.09}  & \textbf{93.63$\pm$0.05} & \textbf{93.80$\pm$0.13} 
&\textbf{71.18$\pm$0.29}  & \textbf{71.99$\pm$0.27} & {73.02$\pm$0.32}
\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
RigL+KD
&92.98$\pm$0.15 & 93.38$\pm$0.14 & 93.61$\pm$0.15
&70.89$\pm$0.35 & 72.16$\pm$0.21 & 72.76$\pm$0.09
\\
RigL+Sup-tickets (ours)
&\textbf{93.20$\pm$0.13}  &  \textbf{93.81$\pm$0.11}  & \textbf{93.85$\pm$0.25}
&\textbf{71.31$\pm$0.21} & \textbf{72.57$\pm$0.29} &  {\textbf{73.61$\pm$0.11}}
\\
\cmidrule[\heavyrulewidth](lr){1-7}

\end{tabular}}
\end{table*}




\newpage
\section{Statistical Significance}


{We analyze the statistical significance of the results obtained by Sup-tickets. To measure this, we perform the Kolmogorov--Smirnov test~\citep{berger2014kolmogorov} (KS test). The null hypothesis is that the two independent samples are drawn from the same continuous distribution. If the p-value is small ($p < 0.05$), the null hypothesis is rejected, and the difference between the two sets of results is considered statistically significant. Otherwise, the null hypothesis cannot be rejected, i.e., the difference between the two sets of results is not statistically significant. We run the experiments with sparse VGG-16 on CIFAR-10/100 for 15 runs with different random seeds and report the mean accuracy, the p-value, and the significance decision in Table~\ref{table:statistical_sig}.}

\begin{table*}[htbp]
\centering
\caption{{\textbf{Statistical Significance Analysis.}}}

\label{table:statistical_sig}
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{lccc ccc}
\cmidrule[\heavyrulewidth](lr){1-7}

 \textbf{Dataset}     & \multicolumn{3}{c}{CIFAR-10} & \multicolumn{3}{c}{CIFAR-100}  \\ 
 \cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
 


Sparsity     & 95\%      & 90\%     & 80\%     
     &  95\%      & 90\%     & 80\%         \\ 
     
 \cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

SET
& 92.99$\pm$0.16 & 93.41$\pm$0.20   & 93.65$\pm$0.15
& 70.50$\pm$0.31 & 71.55$\pm$0.38  & 72.76$\pm$0.21
\\
SET+Sup-tickets (ours)
&\textbf{93.17$\pm$0.16}  & \textbf{93.65$\pm$0.15} & \textbf{93.91$\pm$0.20} &
\textbf{71.18$\pm$0.27}  & \textbf{72.21$\pm$0.29} & \textbf{73.38$\pm$0.29}
\\
P-value 
& 5.90e-2 & 1.87e-2 & 1.02e-2
& 1.88e-05 & 1.02e-2 & 1.87e-2 

\\
Statistically significant  
& No &  Yes  & Yes
& Yes &  Yes &  Yes

\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
RigL
& 92.94$\pm$0.20 & 93.41$\pm$0.14   &93.56$\pm$0.10
& 70.74$\pm$0.33   & 71.97$\pm$0.32  & 72.76$\pm$0.33 
\\
RigL+Sup-tickets (ours)
&\textbf{93.35$\pm$0.18}  &  \textbf{93.69$\pm$0.08}  & \textbf{93.85$\pm$0.12}
&\textbf{71.41$\pm$0.29} & \textbf{72.63$\pm$0.23} &  {\textbf{73.26$\pm$0.29}}

\\
P-value 
&1.63e-4 & 1.88e-05 & 1.88e-05
&1.02e-3 & 1.4e-06 & 1.87e-2

\\
Statistically significant  
& Yes &  Yes  & Yes
& Yes &  Yes &  Yes

\\
\cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}

\\


\end{tabular}}
\end{table*}




\section{Comparison with SWA}

{Compared with SWA~\cite{izmailov2018averaging}, our approach provides two advantages. First, our method is much more training-efficient, as it only requires training a subset of the network during the whole training process. In contrast, SWA requires fully training a dense network, even if it can be pruned afterward. Second, our method can efficiently discover and average \textit{multiple sparse sub-networks with different connectivity}, whereas SWA can only average sparse sub-networks with the same sparse connectivity. Unlike dense neural networks, whose connectivity is fixed, sparse training admits numerous sparse sub-networks with different connectivities, all of which are capable of good performance. Instead of averaging sparse neural networks with the same sparse connectivity, it is more beneficial to average multiple sparse sub-networks with different connectivities, since the sparse connectivity at initialization is insufficient to guarantee good performance.}


{In the following, we compare our method with two SWA-based baselines. In the first baseline, we run SWA with an additional pruning step before the averaging. Unfortunately, this conflicts with the goal of sparse training, leading to more training FLOPs. In contrast, our approach follows a sparse-to-sparse paradigm that trains only a fraction of the parameters during the whole training process. In the second baseline, we train a sparse model from scratch without connection exploration. The results in Table~\ref{table:SWA} empirically demonstrate the benefits of our method, which achieves better performance while requiring far fewer training FLOPs.}




\begin{table*}[htbp]
\centering
\caption{{\textbf{Comparison with SWA.} Test accuracy (\%) and training FLOPs of ResNet-50 on CIFAR-100. The training FLOPs are normalized by those of the dense model. SWA baseline$^1$ means that we train a dense model until the first averaging operation, prune it to the target sparsity with magnitude pruning, and then run SWA without exploring sparse connectivity. SWA baseline$^2$ means that we initialize a model at the target sparsity level and perform SWA without connection exploration.}}



\label{table:SWA}
\resizebox{0.9\textwidth}{!}{
\begin{tabular}{lccc ccc}
\cmidrule[\heavyrulewidth](lr){1-7}

 \textbf{Method}     & \multicolumn{3}{c}{Accuracy} & \multicolumn{3}{c}{Training FLOPs ($\times\, 9.74 \times 10^{18}$)}  \\ 
 \cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
 


 & 95\% Sparsity      & 90\% Sparsity    & 80\% Sparsity     
 &  95\% Sparsity      & 90\% Sparsity    & 80\% Sparsity         \\ 
 
 \cmidrule(lr){1-1}
\cmidrule(lr){2-4}
\cmidrule(lr){5-7}
SWA baseline$^1$
& 76.64$\pm$0.45 &  77.23$\pm$0.44 & 77.72$\pm$0.29 
& $0.91\times$ & $0.92 \times$  & $0.93 \times$
\\
SWA baseline$^2$
&75.66$\pm$0.45 &  76.67$\pm$0.14 &  77.50$\pm$0.36 
& $0.11\times$ & $0.18 \times$  & $0.30 \times$
\\

Sup-tickets (ours)
&\textbf{77.58$\pm$0.47}  &  {\textbf{78.52$\pm$0.39}}  &    {\textbf{78.69$\pm$0.30}}
& $0.11\times$ & $0.18 \times$  & $0.30 \times$
\\
\cmidrule(lr){1-7}


\end{tabular}}
\end{table*}

\bibliography{yin_307}
\end{document}


