% Table: per-split statistics of the PTB, Yahoo and Yelp corpora.
% Columns: split name, number of sentences, min / max / avg length
% (presumably sentence length in words -- confirm), and total word count.
\begin{table}[t!]
\caption{Statistics of datasets for language modeling.}
\label{tab:stat}
% Negative skip tightens the gap between the caption and the table body.
\vspace{-2em}
\begin{center}
\begin{small}
\setlength{\tabcolsep}{2.5pt}
\begin{sc}
% All numeric columns are right-aligned so digits line up across rows.
\begin{tabular}{lrrrrr}
\toprule
Dataset & \#sent & min$_l$ & max$_l$ & avg$_l$ & \#word \\
\midrule
$\mathtt{PTB}$-train & 41,931 & 2 & 82 & 21.2 & 887,384 \\
$\mathtt{PTB}$-valid & 3,357 & 2 & 74 & 21.0 & 70,377 \\
$\mathtt{PTB}$-test & 3,756 & 2 & 77 & 20.9 & 78,664 \\
\midrule
% Yahoo/Yelp: min and max were previously listed in swapped order
% (min = 200, max = 20), which contradicts the averages in the same rows.
$\mathtt{Yahoo}$-train & 100,000 & 20 & 200 & 78.7 & 7,872,281 \\
$\mathtt{Yahoo}$-valid & 10,000 & 20 & 200 & 79.1 & 790,680 \\
$\mathtt{Yahoo}$-test & 10,000 & 20 & 200 & 78.9 & 788,673 \\
\midrule
% NOTE(review): Yelp-train reports a maximum of 201 while the other Yelp
% splits report 200 -- value kept as given; confirm against the data.
$\mathtt{Yelp}$-train & 100,000 & 20 & 201 & 96.0 & 9,603,135 \\
$\mathtt{Yelp}$-valid & 10,000 & 20 & 200 & 96.1 & 961,392 \\
$\mathtt{Yelp}$-test & 10,000 & 20 & 200 & 95.7 & 956,556 \\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
% Negative skip reclaims vertical space below the float.
\vspace{-2em}
\end{table}