% Right-aligned wrapped table (half the text width) reporting the WER of the
% text-to-token model on VCTK and on the interleaved speech-text data.
\begin{wraptable}{r}{0.5\textwidth}
\centering
% Manual tweak: pull the float up to tighten the space above the caption.
\vspace{-1em}
% TeX-style quotes (``...'') are used so the opening quote is typeset correctly;
% a straight " renders as a closing quote on both sides.
\caption{\textbf{Word Error Rate (WER) of the text-to-token model}. The dataset labeled ``Interleaved Data'' refers to text and speech pairs generated during the construction of interleaved speech-text data.}
% The label follows the caption directly so it picks up the table counter.
\label{tab:ttsllm_wer}
\begin{tabular}{lcc}
\toprule
\textbf{Dataset} & \textbf{Language} & \textbf{WER (\%)} \\
\midrule
% Single-row entry: no \multirow wrapper is needed.
VCTK & English & 3.20 \\
\midrule
% The dataset name spans the two language rows below.
\multirow{2}{*}{Interleaved Data} & English & 11.62 \\
                          & Chinese & 9.34 \\
\bottomrule
\end{tabular}
% Manual tweak: reduce the space between the table and the wrapped text below.
\vspace{-1em}
\end{wraptable}