% Pre-training data statistics, typeset as a text-wrapped float on the right
% side occupying half the text width.
% `wraptable` (wrapfig package) is used rather than `wrapfigure` so that the
% caption is numbered as a Table, in agreement with the `tab:` label.
\begin{wraptable}{r}{0.5\textwidth}
\centering
% Halve the inter-column padding (presumably so four columns fit the narrow box).
\setlength{\tabcolsep}{0.5\tabcolsep}
% Caption above the tabular; \label follows \caption so it picks up the table number.
\caption{Statistics of the four data types used in pre-training}
\label{tab:data_statistics}
\begin{tabular}{lccc}
\toprule
% Two-line header: the second line carries the unit/quantity for each column.
\textbf{Data Type} & \textbf{Speech} & \textbf{Text} & \textbf{Audio} \\
 & \textbf{(hours)} & \textbf{Tokens} & \textbf{Tokens} \\
\midrule
Unsup. Text & - & 10T & - \\
\midrule
% NOTE(review): Y*, Z*, W*, V*, U*, S*, T*, R* look like placeholders -- confirm
% and replace with the real figures before publication.
Unsup. Speech (EN) & Y1 & - & Z1 \\
Unsup. Speech (ZH) & Y2 & - & Z2 \\
\midrule
Interleaved (EN) & W1 & V1 & U1 \\
Interleaved (ZH) & W2 & V2 & U2 \\
\midrule
Supervised (EN) & S1 & T1 & R1 \\
Supervised (ZH) & S2 & T2 & R2 \\
\midrule
% All four cells are spelled out so the row matches the `lccc` column spec.
% TODO: fill in the totals for speech hours, text tokens and audio tokens.
\textbf{Total} & & & \\
\bottomrule
\end{tabular}
\end{wraptable}