\begin{table}[H]
\centering
\caption{Training data for the text-to-token LLM with the 12.5\,Hz speech tokenizer, synthesized by the CosyVoice model on span-corruption data.}
\label{tab:tts-llm-breakdown-cosy-voice}
\begin{tabular}{ccccc}
\toprule
Language & Speech Hours & Speech Tokens & Text Tokens & Total Tokens \\ \midrule
Chinese  & 9,895      & 0.45B          & 0.13B        & 0.58B         \\
English  & 11,074     & 0.50B          & 0.16B        & 0.65B         \\ \bottomrule
\end{tabular}
\end{table}
