
% Corpus statistics per language: number of sentences, number of tokens,
% average sentence length (tokens / sentences) and maximum sentence length.
\begin{table}[ht]
    \centering
    \begin{tabular}{|l|c|c|c|c|}
        \hline
        % "Avg.\ Length": the control space stops TeX from treating the period
        % of the abbreviation as a sentence end (which would widen the gap).
        \textbf{Language} & \textbf{Sentences} & \textbf{Tokens} & \textbf{Avg.\ Length} & \textbf{Max Length} \\
        \hline
        Amharic (amh) & 667,021 & 7,123,930 & 10.68 & 508 \\
        \hline
        % NOTE(review): the ISO 639-2/3 code for Oromo is "orm"; "oro" is kept
        % unchanged here in case it is the abbreviation used throughout the
        % paper -- confirm and harmonise.
        Afan Oromo (oro) & 667,021 & 8,369,542 & 12.55 & 474 \\
        \hline
        % Totals: sentence and token counts are the sums of the two rows above;
        % the average is total tokens / total sentences, the maximum is the
        % larger of the two per-language maxima.
        \textbf{Total} & 1,334,042 & 15,493,472 & 11.61 & 508 \\
        \hline
    \end{tabular}
    \caption{Word count distribution of the Amharic and Afan Oromo sentences in the dataset.}
    \label{tab:word_count_distribution}
\end{table}



% Sentence-length breakdown (Short / Medium / Long) of the full corpus and of
% its training, validation and test subsets. Every cell shows the sentence
% count followed by its share of the row total in parentheses.
% NOTE(review): the length thresholds that define the three bins are not
% stated in this table -- make sure they are given in the surrounding text.
\begin{table}[ht]
    \centering
    % One left-aligned label column, then four centred columns, all ruled.
    \begin{tabular}{|l|*{4}{c|}}
        \hline
        \textbf{Category}        & \textbf{Short}     & \textbf{Medium}    & \textbf{Long}    & \textbf{Total}   \\ \hline
        \textbf{Original Corpus} & 399,588 (59.91\%) & 201,925 (30.27\%) & 65,508 (9.82\%) & 667,021 (100\%) \\ \hline
        % The three subsets below add up to the original corpus (667,021).
        \textbf{Training Set}    & 319,904 (59.95\%) & 161,400 (30.25\%) & 52,312 (9.80\%) & 533,616 (100\%) \\ \hline
        \textbf{Validation Set}  & 40,046 (60.04\%)  & 20,081 (30.11\%)  & 6,575 (9.86\%)  & 66,702 (100\%)  \\ \hline
        \textbf{Test Set}        & 39,638 (59.42\%)  & 20,444 (30.65\%)  & 6,621 (9.93\%)  & 66,703 (100\%)  \\ \hline
    \end{tabular}
    \caption{Distribution of sentence lengths across the original corpus and its subsets.}
    \label{tab:dataset_distribution}
\end{table}
