\begin{table}[t]
\renewcommand{\arraystretch}{1.1}
\setlength{\tabcolsep}{0.5\tabcolsep}
\centering
\caption{\textbf{Ablation study on interleaved data scaling and pre-training data composition.} `S': speech input and output. `S$\rightarrow$T': speech input and text output. `T$\rightarrow$S': text input and speech output. ``Web Q.'' stands for Web Questions. ``Llama Q.'' stands for Llama Questions.}
\resizebox{\textwidth}{!}{
\begin{tabular}{@{}lcccccc|cccccc@{}}
\toprule
\multirow{3}{*}{Model} & \multicolumn{6}{c|}{\textbf{Speech Language Modeling}} & \multicolumn{6}{c}{\textbf{Spoken Question Answering}} \\
 & \multicolumn{3}{c}{sTopic-StoryCloze} & \multicolumn{3}{c|}{sStoryCloze} & \multicolumn{2}{c}{Web Q.} & \multicolumn{2}{c}{Llama Q.} & \multicolumn{2}{c}{Trivia QA} \\
 & S & T$\rightarrow$S & S$\rightarrow$T & S & T$\rightarrow$S & S$\rightarrow$T & S & S$\rightarrow$T & S & S$\rightarrow$T & S & S$\rightarrow$T \\
 \midrule
9B (600B Interleave) & \textbf{82.9}  & \textbf{85.0} & \textbf{93.6} & \textbf{62.4} & \textbf{63.2} & \textbf{76.3} & \textbf{15.9} & \textbf{32.2} & \textbf{50.7} & \textbf{64.7} & \textbf{26.5} & \textbf{39.1} \\
- No Interleaving & 72.8 & 53.3 & 53.3 & 51.7 & 51.4 & 53.7 & 0.1 & 0.3 & 2.3 & 2.3 & 0.2 & 0.5  \\ 
- 100B Interleave & 80.9 & 83.6 & 93.3 & 59.4 & 61.3 & 73.4 & 9.3 & 25.4 & 37.0 & 60.0 & 11.7 & 26.9 \\
- 200B Interleave & 82.1 & 84.7 & 93.2 & 61.5 & 62.6 & 76.0 & 13.3 & 29.7 & 44.0 & 63.0 & 18.7 & 31.3 \\ \midrule
1.5B & 77.5 & 81.4 & \textbf{90.1} & 55.4 & 58.6 & 64.0 & 5.4 & \textbf{17.6} & 18.3 & \textbf{42.7} & 4.6 & 15.6 \\
- No Interleaving & 74.6 & 55.3 & 51.3 & 51.7 & 53.4 & 52.8 & 0.0 & 0.1 & 1.3 & 4.3 & 0.0 & 0.2 \\ 
- No Speech & 78.0 & 82.1 & 89.5 & 55.9 & 59.3 & 64.4 & 4.9 & 17.3 & 17.7 & 38.3 & 4.6 & 15.5  \\
- No Text & 78.5 & \textbf{83.2} & 89.0 & 54.8 & 58.3 & 63.4 & \textbf{6.3} & 17.4 & 20.3 & 42.3 & \textbf{7.0} & \textbf{16.3} \\
- No ASR \& TTS & \textbf{78.7} & 81.4 & 89.5 & \textbf{56.7} & \textbf{59.5} & \textbf{68.5} & 6.1 & 17.0 & \textbf{23.0} & 41.0 & 5.4 & 14.0 \\ 
% - Randomly-initialize \\
\bottomrule
\end{tabular}
}
\label{tab:pre-training-ablation}
\end{table}

\begin{figure}[]
    \vspace{-0.5em}
    \centering
    \begin{subfigure}[b]{0.32\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures/sampling_rate.pdf}
         \caption{Sampling Rate}
         \label{fig:sampling_rate}
     \end{subfigure}
     \begin{subfigure}[b]{0.32\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures/span_corruption_ratio.pdf}
         \caption{Span Corruption Ratio}
         \label{fig:span_corruption_ratio}
     \end{subfigure}
     \begin{subfigure}[b]{0.32\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures/average_performance_interleaved.pdf}
         \caption{Interleaved Data Tokens}
         \label{fig:interleave_data_tokens}
     \end{subfigure}
    \caption{(a) Sampling rate vs.\ average accuracy. (b) Span corruption ratio vs.\ average accuracy. The accuracy is averaged over the speech language modeling and spoken question answering datasets. (c) Interleaved data tokens vs.\ average performance after supervised fine-tuning.}
    \label{fig:pretrain-ablation}
\end{figure}