% Speech reconstruction comparison table (label: tab:speech_tokenizer).
% Layout: 7 columns = Model | Sample Rate | Bitrate | Causal | WER | VisQOL | MOSNet,
% where the last three are grouped under a single "LibriSpeech" header.
% Uses \toprule/\midrule/\cmidrule/\bottomrule (booktabs) and \multirow (multirow);
% \cmark / \xmark are not defined in this snippet -- presumably preamble macros, verify.
\begin{table}[t]
\centering
% Halve the inter-column padding; the change is local to this float.
\setlength{\tabcolsep}{0.5\tabcolsep}
\caption{\textbf{Speech Reconstruction Results.} We evaluate semantic retention with Word Error Rate (WER) and reconstruction quality with VisQOL~\citep{visqol} and MOSNet~\citep{MOSNet} for different speech tokenizers across various sample rates. The baseline results are independently evaluated by us.}
% \resizebox{\textwidth}{!}{
% Column spec declares exactly the 7 columns that every row supplies.
\begin{tabular}{lcccccc}
\toprule
% The group header spans the three metric columns (5-7), matching the \cmidrule below.
\multirow{2}{*}{\textbf{Model}} & \multirow{2}{*}{{\shortstack{\textbf{Sample}\\\textbf{Rate}}}} & \multirow{2}{*}{\textbf{Bitrate}} & \multirow{2}{*}{\textbf{Causal}} & \multicolumn{3}{c}{\textbf{LibriSpeech}} \\
\cmidrule(lr){5-7}
 & & & & WER$\downarrow$ & VisQOL$\uparrow$ & MOSNet$\uparrow$ \\
\midrule
% Reference row and baseline tokenizers; "-" marks a value that is not reported.
Ground Truth & - & - & - & 4.62 & - & 3.27 \\
RVQGAN & 75Hz & 1.50K & \xmark & - & 1.74 & 2.74 \\
SemantiCodec & 50Hz & 1.30K & \xmark & - & 2.43 & 3.12 \\
SpeechTokenizer & 50Hz & 1.50K & \xmark & - & 1.53 & 2.67 \\
SpeechTokenizer & 50Hz & 4.00K & \xmark & - & 3.07 & 3.10 \\
Spirit-Base & 25Hz & 225.0 & \xmark & 11.66 & - & - \\
Spirit-Expressive & 38.5Hz & 307.0 & \xmark & 10.60 & - & - \\
Moshi (Mimi) & 12.5Hz & 1.10K & \cmark & 8.36 & 2.82 & 2.89 \\
\midrule
% Proposed tokenizer at four sample rates; the \multirow label spans all four rows.
\multirow{4}{*}{Ours} & 50Hz & 180.6 & \cmark & 6.24 & 2.67 & 3.38 \\
 & 25Hz & 90.3 & \cmark & 6.80 & 2.60 & 3.33 \\
 & 12.5Hz & 52.7 & \cmark & 8.43 & 2.52 & 3.39 \\
 & 6.25Hz & 30.1 & \cmark & 14.41 & 2.34 & 3.24 \\
\bottomrule
\end{tabular}
% }
\label{tab:speech_tokenizer}
\end{table}