\begin{table}[t]
  % Results table for end-to-end spoken chatbots.
  % Column layout (6 columns, `lccccc`):
  %   1    system / ablation name (left-aligned)
  %   2    "w/o Text" marker (\xmark or \cmark, defined outside this block)
  %   3-4  Content Quality: General QA, Knowledge
  %   5-6  Speech Quality:  UTMOS, ASR-WER
  % Rows are split by \midrule into three groups: baselines, the
  % "9B + Text-guided" system with its ablations, and the "9B" system
  % with its ablations.
  \centering
  % \setlength{\tabcolsep}{0.5\tabcolsep}
  \caption{\textbf{Evaluation results for end-to-end spoken chatbots.}
    All baseline results in this table were obtained through our own evaluation.}
  \label{tab:alignment}
  \begin{tabular}{lccccc}
    \toprule
    % Header row 1: the "w/o Text" cell spans both header rows via \multirow.
    & \multirow{2}{*}{\shortstack{w/o\\Text}}
      & \multicolumn{2}{c}{\textbf{Content Quality}}
      & \multicolumn{2}{c}{\textbf{Speech Quality}} \\
    \cmidrule(lr){3-4}
    \cmidrule(l){5-6}
    % Header row 2: per-metric names; columns 1-2 stay empty here.
    & & General QA$\uparrow$ & Knowledge$\uparrow$ & UTMOS$\uparrow$ & ASR-WER$\downarrow$ \\
    \midrule
    % --- Baseline systems ---
    SpeechGPT         & \textcolor{red}{\xmark}   & 1.40          & 2.20          & 3.86          & 66.57         \\
    Mini-Omni         & \textcolor{red}{\xmark}   & 2.44          & 1.10          & 3.17          & 25.28         \\
    Llama-Omni        & \textcolor{red}{\xmark}   & 3.50          & 3.90          & 3.92          & 9.18          \\
    Moshi             & \textcolor{red}{\xmark}   & 2.42          & 3.60          & 3.90          & 7.95          \\
    \midrule
    % --- 9B + Text-guided, followed by its two ablations ---
    % NOTE(review): UTMOS 4.33 is bold here, but the same value appears
    % unbolded in two rows of the next group -- confirm the intended
    % policy for ties before submission.
    9B + Text-guided  & \textcolor{red}{\xmark}   & \textbf{3.69} & \textbf{4.70} & \textbf{4.33} & 7.83          \\
    - No Interleaving & \textcolor{red}{\xmark}   & 2.48          & 1.00          & 4.31          & 10.34         \\
    - No Pre-training & \textcolor{red}{\xmark}   & 2.20          & 0.90          & 4.32          & \textbf{6.11} \\
    \midrule
    % --- 9B, followed by its two ablations ---
    % The ASR-WER cell holds $\emptyset$ for every row of this group.
    9B                & \textcolor{green}{\cmark} & 3.18          & 3.20          & 4.33          & $\emptyset$   \\
    - No Interleaving & \textcolor{green}{\cmark} & 1.21          & 0.01          & 4.33          & $\emptyset$   \\
    - No Pre-training & \textcolor{green}{\cmark} & 1.10          & 0.00          & 4.32          & $\emptyset$   \\
    \bottomrule
  \end{tabular}
\end{table}
