% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{hyperref}
\usepackage{xurl}
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{multirow}
\usepackage{multicol}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{caption}
\usepackage{enumitem}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{main}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{SPDF: Sparse Pre-training and Dense Fine-tuning for Large Language Models\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:vithu@cerebras.net}{Vithursan Thangarasa}{}}
\author[1]{Abhay~Gupta}
\author[1]{William~Marshall}
\author[*]{Tianda~Li}
\author[1]{Kevin~Leong}
\author[*]{\\Dennis DeCoste}
\author[1]{Sean Lie}
\author[1]{Shreyas Saxena}
% Add affiliations after the authors
\affil[1]{%
    Cerebras Systems Inc.\\
    Sunnyvale, California, USA
}
  
\begin{document}
  
\onecolumn %% Turn this off if a two-column layout is desired for the supplement
\maketitle

\appendix

\section*{Appendix}

\section{Experimental Setup and Hyperparameter Details}
\label{app:hyperparameters}

\subsection{Pre-training on PILE} To train all GPT models, we use the AdamW
optimizer~\citep{loshchilov2017decoupled} with $\beta_1 = 0.9$, $\beta_2 =
0.999$ and $\epsilon = 10^{-8}$. The global norm is clipped at 1.0, and a weight
decay of 0.1 is used. There is a learning rate warm-up over the first 375M
tokens, followed by a cosine decay to 10\% of the peak learning rate. In
Table~\ref{tab:app-gpt-arch}, we provide details on the size and architecture
configurations of the models we pre-trained. Here, $n_{params}$ is the total
number of trainable parameters, $n_{layers}$ is the number of decoder layers,
and $d_{model}$ is the base size of the model. The feedforward bottleneck is
four times the base size, i.e., $d_{\text{ff}} = 4\times d_{model}$. Finally,
$n_{heads}$ is the number of attention heads and $d_{head}$ is the dimension of
each attention head. The context window size is set to 2048
following~\citet{brown2020gpt3}.

\begin{table*}[h]
    \caption{ Sizes, architectures, and learning hyperparameters (batch size and
        learning rate) of the models, which are trained to Chinchilla optimal
        configurations ($\approx$ 20 tokens per parameter). }
    \label{tab:app-gpt-arch}
    \centering
    \makebox[\linewidth]{{
    \begin{tabular}{l|ccccc|cc|c}
        \toprule
        Model & $n_{params}$ & $n_{layers}$ & $d_{model}$ & $n_{heads}$  &
        $d_{head}$ & Batch Size & Learning Rate & Training Tokens \\ \midrule
        GPT-2 Small           &  125M & 12 & 768 & 12 & 64 & 256 & 6e-4 & 2.5B
        \\
        GPT-3 XL &  1.3B & 24 & 2048 & 16 & 128 & 512 & 2e-4 & 26B \\ 
        \bottomrule
    \end{tabular}
    } }
    \label{tab:app-gpt-pt}
\end{table*}

\paragraph{Sparsity Setup} As mentioned in Section~\ref{sec:method}, we use
random pruning (static sparsity) for training all our GPT models. The sparsity
is distributed uniformly across all layers (i.e., all layers get the same
specified sparsity level) irrespective of the number of parameters or FLOPs of a
given layer. We sparsify only the dense linear layers: the two in the MLP
module, $W_I$ (intermediate) and $W_O$ (MLP output projection); and the four
weight matrices in the self-attention module, $W_Q$ (query), $W_K$ (key), $W_V$
(value) and $W_D$ (attention output projection). In this setting, we keep the
embeddings (implemented as sparse lookups), LayerNorm~\citep{ba2016ln}, and
biases dense.

\subsection{Natural Language Generation}
Similar to~\citet{hu2022lora}, we train all of our GPT-2 Small and GPT-3 XL
models using AdamW~\citep{loshchilov2017decoupled} with a linear learning rate
schedule for 5 epochs, and perform early stopping when the models begin to
overfit. We perform a grid search to find the batch size and learning rate that
lead to the best downstream BLEU score on each of the tasks for a given compute
budget. More specifically, on the dense baseline and sparse variants, we select
the best batch size among \{8, 16, 32, 64\} and the best learning rate
among \{1e-4, 5e-5, 2.5e-5\} on the validation set. The beam search beam size,
length penalty, and no-repeat n-gram size remain the same as described
by~\citet{hu2022lora}.

\subsection{Curation Corpus}
We fine-tune the dense GPT-2 Small and its sparse variants on the Curation
Corpus~\citep{curationcorpusbase:2020} following the setup presented
by~\citet{rae2021gopher}. We fine-tune for 5 epochs on Curation Corpus, and
perform early stopping once the models start to overfit. We perform a grid
search to find the learning rate that leads to the best perplexity for a given
compute budget. More
specifically, on the dense baseline and sparse variants, we use a batch size of
32 and select the best learning rate among \{3e-3, 1e-3, 3e-4, 1e-4, 3e-5,
1e-5\} on the validation set.

\subsection{Training FLOPs for Pre-training and Fine-tuning}
\label{app:train-flops}

We compute the total pre-training FLOPs for the dense and sparse variants of
GPT-2 Small and GPT-3 XL, and report them in Table~\ref{tab:flops-gpt}, along
with their relative FLOPs reduction over the dense baseline. We also compute the
total dense fine-tuning FLOPs for GPT-2 Small and GPT-3 XL on E2E, WebNLG, DART
and Curation Corpus, and report them in Table~\ref{tab:flops-ft-gpt}. The total
training FLOPs during the fine-tuning phase are a small fraction of the total
pre-training FLOPs, even though fine-tuning is performed in a dense manner.

\begin{table*}[]
    \caption{Details on the total pre-training FLOPs for each pre-trained model. We also report the relative FLOPs reduction of the sparse variants over the dense baseline. We note that the reported FLOPs per sequence (seq) include both forward and backward passes.}
    \label{tab:flops-gpt}
    \makebox[\linewidth]{\begin{tabular}{cc|cccc|c}
        \toprule
    Model       & \begin{tabular}[c]{@{}c@{}}Pre-Train\\ Sparsity\end{tabular} &
    \begin{tabular}[c]{@{}c@{}}Total \\ Seqs\end{tabular} &
    \begin{tabular}[c]{@{}c@{}}Total FLOPs/\\ Seq\end{tabular} &
    \begin{tabular}[c]{@{}c@{}}Total \\ FLOPs\end{tabular} &
    \begin{tabular}[c]{@{}c@{}}Total \\ exaFLOPs\end{tabular} &
    \begin{tabular}[c]{@{}c@{}}FLOPs Reduction \\ over Dense\end{tabular} \\
    \midrule \multirow{3}{*}{GPT-2 Small} & 0\% & 1.22e6 & 1.99e12 & 2.43e18 &
    2.43 & 1x \\
     & 50\%                                                         & 1.22e6 &
    1.47e12 & 1.79e18                                                & 1.79 &
    0.737x \\
     & 75\%                                                         & 1.22e6 &
    1.20e12 & 1.46e18                                                & 1.46 &
    0.601x \\
    \midrule \multirow{3}{*}{GPT-3 XL} & 0\% & 1.27e7 & 1.86e13 & 2.36e20 &
    236.10 & 1x \\
     & 50\%                                                         & 1.27e7 &
    1.12e13 & 1.42e20 & 141.87 & 0.601x \\
     & 75\%                                                         & 1.27e7 &
    7.46e12 & 9.48e19 & 94.76 & 0.401x                     
    \\\bottomrule                                                              
    \end{tabular}
    }
\end{table*}

\begin{table*}[!ht]
    \caption{Details on the total fine-tuning FLOPs for GPT-2 Small and GPT-3 XL on E2E, WebNLG, DART and Curation Corpus tasks. We note that the reported FLOPs per sequence (seq) include both forward and backward passes.}
    \label{tab:flops-ft-gpt}
    \makebox[\linewidth]{\begin{tabular}{c|c|cccc}
        \toprule
    Dataset                          & Model       &
    \begin{tabular}[c]{@{}c@{}}Total \\ Seq\end{tabular} &
    \begin{tabular}[c]{@{}c@{}}Total FLOPs/\\ Seq\end{tabular} &
    \begin{tabular}[c]{@{}c@{}}Total \\ FLOPs\end{tabular} &
    \begin{tabular}[c]{@{}c@{}}Total \\ exaFLOPs\end{tabular} \\ \midrule
    \multirow{2}{*}{E2E}             & GPT-2 Small & \multirow{2}{*}{1.26e5} &
    1.36e11                                                         & 5.15e16 &
    0.052                                                    \\ 
                                     & GPT-3 XL    & & 1.39e12 & 5.27e17 & 0.527
    \\\midrule \multirow{2}{*}{WebNLG}          & GPT-2 Small &
    \multirow{2}{*}{0.54e5} & 1.36e11 & 2.21e16 & 0.022 \\ 
                                     & GPT-3 XL    & & 1.39e12 & 2.26e17 & 0.226
    \\\midrule \multirow{2}{*}{DART}            & GPT-2 Small &
    \multirow{2}{*}{1.25e5} & 1.36e11 & 5.12e16 & 0.051 \\
                                     & GPT-3 XL    & & 1.39e12 & 5.24e17 & 0.524
    \\
    \midrule \multirow{2}{*}{Curation Corpus} & GPT-2 Small &
    \multirow{2}{*}{0.34e5} & 1.36e11 & 1.38e16 & 0.014 \\
                                     & GPT-3 XL    & & 1.39e12 & 1.41e17 & 0.141
                                     \\ \bottomrule                                                  
    \end{tabular}
    }
\end{table*}

% \vspace*{\fill} \clearpage
\section{Detailed Experiments on NLG and Text Summarization}
\label{app:detailedexps}

In Tables~\ref{tab:e2e},~\ref{tab:webnlg} and~\ref{tab:dart}, we provide
detailed results on all official evaluation metrics for the E2E, WebNLG and DART
tasks. Even across other metrics, we do not observe any significant drop in
performance with sparse pre-trained GPT-2 Small and GPT-3 XL models.
% \vspace*{\fill}
\begin{table*}[!ht]
    \caption{Downstream BLEU, NIST, METEOR, ROUGE-L and CIDEr scores of GPT-2
    Small and GPT-3 XL on E2E at different sparsity levels during pre-training.
    In the metric column, the direction of the arrow indicates better result
    (e.g., up indicates higher is better).}
    \label{tab:e2e}
    \makebox[\linewidth]{

    \begin{tabular}{cc|ccccc}
    \toprule
    \centering
    \multirow{2}{*}{Model}                        &
                                                  \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Pre-Train\\
                                                  Sparsity\end{tabular}} &
                                                  \multicolumn{5}{c}{E2E}
                                                  \\ \cmidrule{3-7} & &
                                                  BLEU$\uparrow$ &
                                                  NIST$\uparrow$ &
                                                  METEOR$\uparrow$ &
                                                  ROUGE-L$\uparrow$ &
                                                  CIDEr$\uparrow$ \\ \midrule
                                                  \multirow{3}{*}{GPT-2 Small}
                                                  & 0\% &
                                                  67.49\textsubscript{$\pm$
                                                  0.60}               &
                                                  8.59\textsubscript{$\pm$ 0.03}
                                                  & 46.08\textsubscript{$\pm$
                                                  0.22}           &
                                                  70.22\textsubscript{$\pm$
                                                  0.42} &
                                                  2.38\textsubscript{$\pm$ 0.03}
                                                  \\
                                                  & 50\% &
                                                  67.39\textsubscript{$\pm$
                                                  0.38} &
                                                  8.62\textsubscript{$\pm$ 0.03}
                                                  & 45.89\textsubscript{$\pm$
                                                  0.18} &
                                                  70.10\textsubscript{$\pm$
                                                  0.26} &
                                                  2.38\textsubscript{$\pm$ 0.01}
                                                  \\
                                                  & 75\% &
                                                  66.50\textsubscript{$\pm$
                                                  0.01} &
                                                  8.46\textsubscript{$\pm$ 0.13}
                                                  & 45.61\textsubscript{$\pm$
                                                  0.32} &
                                                  70.02\textsubscript{$\pm$
                                                  0.26}&
                                                  2.36\textsubscript{$\pm$ 0.02}
                                                  \\
    \midrule \multirow{3}{*}{GPT-3 XL} & 0\% & 68.10\textsubscript{$\pm$ 0.46} &
    8.64\textsubscript{$\pm$ 0.06} & 46.40\textsubscript{$\pm$ 0.01} &
    71.03\textsubscript{$\pm$ 0.22} & 2.41\textsubscript{$\pm$ 0.02} \\
    \multicolumn{1}{l}{}                          & 50\% &
    67.98\textsubscript{$\pm$ 0.63} & 8.62\textsubscript{$\pm$ 0.07} &
    46.40\textsubscript{$\pm$ 0.07} & 71.30\textsubscript{$\pm$ 0.20} &
    2.43\textsubscript{$\pm$ 0.02} \\
    \multicolumn{1}{l}{}                          & 75\% &
    67.66\textsubscript{$\pm$ 0.59} & 8.59\textsubscript{$\pm$ 0.09} &
    46.07\textsubscript{$\pm$ 0.26} & 70.40\textsubscript{$\pm$ 0.26} &
    2.42\textsubscript{$\pm$ 0.03} \\ \bottomrule
    \end{tabular}
    }
\end{table*}

\begin{table*}[]
    \caption{Downstream BLEU, METEOR and TER scores of GPT-2 Small and GPT-3 XL
    on WebNLG at different sparsity levels during pre-training. In the metric
    column, the direction of the arrow indicates better result (e.g., down
    indicates lower is better).}
    \label{tab:webnlg}
    \makebox[\linewidth]{
        \begin{tabular}{cc|ccc}
        \toprule
    \multirow{2}{*}{Model}                        &
                                                  \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Pre-Train\\
                                                  Sparsity\end{tabular}} &
                                                  \multicolumn{3}{c}{WebNLG}
                                                  \\ \cmidrule{3-5} & &
                                                  BLEU$\uparrow$ &
                                                  METEOR$\uparrow$ &
                                                  TER$\downarrow$ \\ \midrule
                                                  \multirow{3}{*}{GPT-2 Small}
                                                  & 0\% &
                                                  63.42\textsubscript{$\pm$
                                                  0.26}              & 0.44 &
                                                  0.34           \\
                                                  & 50\% &
                                                  63.10\textsubscript{$\pm$
                                                  0.13}   
                                                  & 0.44 &    0.34 \\
                                                  & 75\% &
    62.64\textsubscript{$\pm$ 0.22} &             0.43         &          0.34
    \\
    \midrule \multirow{3}{*}{GPT-3 XL} & 0\% & 63.62\textsubscript{$\pm$ 0.23} &
    0.45 & 0.32 \\
    \multicolumn{1}{l}{}                          & 50\% &
    63.47\textsubscript{$\pm$ 0.21} & 0.45 & 0.33 \\
    \multicolumn{1}{l}{}                          & 75\% &
    63.06\textsubscript{$\pm$ 0.11} & 0.45 & 0.33 \\
    \bottomrule
    \end{tabular}
    }
\end{table*}


\begin{table*}[!ht]
    \caption{Downstream BLEU, MET and TER scores of GPT-2 Small and GPT-3 XL on
    DART at different sparsity levels during pre-training. In the metric column,
    the direction of the arrow indicates better result (e.g., down indicates
    lower is better).}
    \label{tab:dart}
    \makebox[\linewidth]{
        \begin{tabular}{cc|ccc}
        \toprule
    \multirow{2}{*}{Model}                        &
                                                  \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Pre-Train\\
                                                  Sparsity\end{tabular}} &
                                                  \multicolumn{3}{c}{DART}
                                                  \\ \cmidrule{3-5} & &
                                                  BLEU$\uparrow$ & MET$\uparrow$
                                                  & TER$\downarrow$ \\ \midrule
                                                  \multirow{3}{*}{GPT-2 Small}
                                                  & 0\% &
                                                  46.30\textsubscript{$\pm$
                                                  0.16}             & 0.38 &
                                                  0.51                  \\
                                                  & 50\% &
                                                  45.74\textsubscript{$\pm$
                                                  0.10} & 0.37 &   0.51 \\
                                                  & 75\% &
    44.97\textsubscript{$\pm$ 0.11} &            0.37          &           0.52
    \\
    \midrule \multirow{3}{*}{GPT-3 XL} & 0\% & 47.71\textsubscript{$\pm$ 0.11} &
    0.39 & 0.49 \\
    \multicolumn{1}{l}{}                          & 50\% &
    47.10\textsubscript{$\pm$ 0.13} & 0.39 & 0.49 \\
    \multicolumn{1}{l}{}                          & 75\% &
    46.96\textsubscript{$\pm$ 0.08} & 0.38 & 0.50 \\
    \bottomrule
    \end{tabular}
    }
\end{table*}


% \begin{table}[] \caption{Downstream test perplexity (PPL) of GPT-2 Small and
%     GPT-3 XL on Curation Corpus at different sparsity levels during
%     pre-training. In the metric column, the direction of the arrow indicates
%     better result (e.g., down indicates lower is better).} \centering
%     \makebox[\linewidth]{ \begin{tabular}{cc|c} \toprule
%     \multirow{2}{*}{Model}                        &
%     \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Pre-Train\\
%                                                   Sparsity\end{tabular}} &
%                                                   Curation Corpus \\
%                                                   \cmidrule{3-3} & &
%                                                   PPL$\downarrow$ \\ \midrule
%                                                   \multirow{3}{*}{GPT-2 Small}
%                                                   & 0\% &           13.38
%                                                   $\pm$ 0.02 \\
%                                                   & 50\% &         15.09 $\pm$
%                                                   0.04 \\
%                                                   & 75\% &       17.14 $\pm$
%     0.01     \\ \midrule \multirow{3}{*}{GPT-3 XL} & 0\% & 8.28 $\pm$ 0.01 \\
%     \multicolumn{1}{l}{}                          & 50\% & 9.21 $\pm$ 0.02 \\
%     \multicolumn{1}{l}{}                          & 75\% & 11.03 $\pm$ 0.02\\
%     \bottomrule \end{tabular}
%     }
% \end{table} \vspace*{\fill}
\clearpage
\section{Unstructured Sparsity on Specialized Hardware Accelerators}
\label{app:unstructured_sparse_nongpu}

In Figure~\ref{fig:matmulspeedup}, we highlight the potential realized gains with
unstructured weight sparsity on the Cerebras CS-2. This figure was regenerated
based on the plot by~\citet{lie_2021}.
% \vspace*{\fill}
\begin{figure*}[!ht]
    \centering
    \includegraphics[keepaspectratio=true, width=0.45\linewidth]{./figures/gpt_speedup.pdf}
    \caption{Measured speedup versus theoretical speedup at varying sparsity
    levels for a GPT-3 layer 12k $\times$ 12k matrix multiplication
    (MatMul)~\citep{lie_2021}.}
    \label{fig:matmulspeedup}
\end{figure*}


\clearpage
\bibliography{refs}

\end{document}
