% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
\usepackage{amsmath} 
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{array} 
\usepackage[compress]{natbib}


% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%
\begin{document}
%
\title{From Bias to Balance: How Multilingual Dataset Composition Affects Tokenizer Performance Across Languages}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{
Aishwarya Selvamurugan\inst{1}\orcidID{0009-0001-3731-2062} \and
Raj Dandekar\inst{2} \and
Rajat Dandekar\inst{2} \and
Sreedath Panat\inst{2}
}

\authorrunning{A. Selvamurugan et al.}

\institute{
Sri Eshwar College of Engineering, Coimbatore, India \\
\email{aishh2305@gmail.com} (Corresponding author)
\and
Vizuara AI Labs, Pune, India \\
\email{raj@vizuara.com, rajatdandekar@vizuara.com, sreedath@vizuara.com}
}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Tokenization serves as a crucial preprocessing step in multilingual language models, affecting performance in both high-resource and low-resource languages. However, current tokenizers seem to adopt language biases due to unbalanced training datasets, leading to a poorly optimized tokenizer for underrepresented languages. This research examines the impact of balanced multilingual datasets on the performance of tokenizers trained with the Byte Pair Encoding, WordPiece, and Unigram Language Model algorithms. We build balanced corpora from various sources and study the impact of vocabulary size at 15k, 30k, and 50k tokens. The trained tokenizers are assessed through intrinsic metrics, including Subword Fertility and Normalized Sequence Length, as well as through extrinsic performance on downstream tasks like Part-of-Speech tagging, Named Entity Recognition, and Machine Translation. We build custom datasets along with customized evaluation pipelines to enable consistent comparisons across nine languages using models built into standard NLP frameworks. Our observations reinforce the importance of a balanced dataset when training tokenizers and, in turn, advance the development of equitable and robust multilingual NLP systems.

\keywords{multilingual tokenization \and subword segmentation \and tokenization bias \and language fairness \and Byte Pair Encoding \and WordPiece \and Unigram Language Model \and cross-lingual NLP \and low-resource languages \and multilingual BERT \and tokenizer evaluation \and morphologically rich languages}

\end{abstract}
%
%
%
\section{Introduction}


Tokenization is a fundamental step in natural language processing (NLP), serving as the bridge between raw text and model input. It enables diverse linguistic structures to be converted into standardized forms that can be effectively processed by deep learning models \cite{sennrich2016neural}. This conversion becomes particularly critical in multilingual settings, where vocabulary overlap across languages is often limited \cite{conneau2020unsupervised}. Subword-based tokenization strategies have emerged as a dominant solution because they alleviate the out-of-vocabulary (OOV) problem by segmenting unseen or rare words into smaller known units \cite{kudo2018sentencepiece}. Such an approach is especially advantageous for morphologically rich and low-resource languages, where word formation processes generate a vast number of infrequent word forms.

Different tokenization schemes optimize various trade-offs between vocabulary compression, sequence length, and downstream model efficiency. Importantly, tokenization quality has been shown to directly affect both model efficiency and fairness, with higher subword fertility in low-resource languages often resulting in longer token sequences and reduced accuracy \cite{lindsey2024comparison}.

Multilingual pretrained models such as mBERT \cite{devlin2019bert} and XLM-R \cite{conneau2020unsupervised} rely on shared subword vocabularies across languages. However, this approach can result in token collisions, inconsistent granularity, and biased performance, particularly disadvantaging underrepresented languages. Recent studies have highlighted that tokenizer design and vocabulary allocation can encode systemic bias, resulting in token inflation or inadequate coverage for certain scripts \cite{counting_ability_2024}.

Despite their widespread use, existing tokenization strategies disproportionately favor high-resource and Latin-script languages. This leads to over-segmentation of low-resource or non-Latin languages, thereby inflating sequence lengths and reducing efficiency \cite{petrov2023language}. Empirical evidence shows that metrics such as subword fertility, normalized sequence length (NSL), and parity vary significantly across languages and directly impact downstream tasks \cite{rust2021how}. Furthermore, tokenizer biases can impose up to 68\% additional training costs when multilingual models are trained on skewed datasets dominated by European languages \cite{ali2023tokenizer}. Beyond computational concerns, these disparities introduce broader societal and economic consequences. Languages that undergo excessive token fragmentation incur higher API usage costs and slower processing times, disproportionately affecting communities that already face digital marginalization \cite{rust2021how,cost_languages_2023}. Such disparities not only undermine fairness and representation but also exacerbate social and linguistic inequalities in multilingual NLP systems.

In this work, we aim to systematically investigate how balanced multilingual corpora can mitigate such disparities in tokenizer performance. Specifically, we analyze nine typologically diverse languages (Yoruba, Arabic, Mandarin Chinese, Russian, Hindi, Japanese, Swahili, Bengali, and Turkish) selected to represent different language families, writing systems, and morphological complexity levels. Our evaluation combines intrinsic metrics (e.g., subword fertility, normalized sequence length) with extrinsic performance on downstream tasks such as part-of-speech (POS) tagging, named entity recognition (NER), and machine translation. By adopting balanced datasets derived from Wikipedia and OSCAR, we seek to assess whether equitable data representation can yield fairer and more efficient tokenizers across languages.

The scope of this study is deliberately focused on subword-based tokenizers, excluding character-level and neural tokenization approaches. Additionally, while we evaluate downstream tasks using curated datasets, the experiments do not extend to full-scale pretrained large language models. Data limitations, particularly for low-resource languages such as Yoruba in NER tasks, further shape the boundaries of our investigation. Nonetheless, this work contributes toward a deeper understanding of multilingual tokenization fairness and provides empirical evidence for the importance of balanced data in tokenizer design.

The remainder of this paper is organized as follows: Section 2 reviews related work on multilingual tokenization bias and fairness. Section 3 presents our methodology, including dataset construction, tokenizer training, and evaluation frameworks across POS tagging, NER, and machine translation tasks. Section 4 reports experimental results comparing tokenizer performance using both intrinsic metrics and downstream task evaluations. Finally, Section 5 discusses our findings, acknowledges limitations, and outlines directions for future research.

\section{Related Work}


\subsection{Tokenization Algorithms}
The three primary subword tokenization algorithms examined in this study optimize different linguistic properties. BPE \cite{sennrich2016neural} uses frequency-driven merging of character pairs, making it effective for capturing morphological patterns but potentially inconsistent across diverse languages. WordPiece \cite{devlin2019bert} extends BPE with likelihood-based merging, offering balanced performance particularly suitable for morphologically aware tasks. Unigram Language Model  \cite{kudo2018subword} employs a probabilistic framework, starting with comprehensive vocabularies and pruning based on likelihood, often yielding more linguistically motivated segmentations for morphologically rich languages.


\subsection{Fairness, Dataset Composition and Evaluation metrics}
Fairness in multilingual NLP has been extensively studied, with tokenization identified as a fundamental source of bias \cite{blodgett2020language}. Conneau et al. (2020) demonstrated that balanced multilingual training data improves cross-lingual transfer performance \cite{conneau2020unsupervised}, while Ahia et al. (2023) found substantial improvements in African languages when using balanced corpora \cite{ahia2023low}. However, most studies focus on model training rather than tokenizer development itself. Tokenization quality assessment employs both intrinsic metrics like Subword Fertility \cite{lindsey2024comparison} and Normalized Sequence Length, and extrinsic performance on downstream tasks. Rust et al. (2021) expanded evaluation frameworks to include parity metrics measuring cross-lingual consistency \cite{rust2021how}.

\subsection{Alternative Approaches}
Character-level approaches like CANINE \cite{clark2022canine} and ByT5 \cite{xue2021byt5} eliminate tokenization bias by operating directly on Unicode characters, though at increased computational cost \cite{lindsey2024comparison}. Recent adaptive methods such as MAGNET propose learnable segment boundaries to reduce over-segmentation \cite{blasi2024magnet}, representing a shift toward dynamic, context-aware tokenization.
This work addresses existing gaps by systematically evaluating tokenization algorithms across nine typologically diverse languages using balanced training corpora, providing comprehensive analysis of both intrinsic metrics and downstream task performance to advance equitable multilingual tokenization.


\section{Methodology}

\subsection{Tokenizers}
\label{sec:tokenizers}
\subsubsection{Dataset Generation}

The dataset for tokenizer training was assembled as a typologically diverse corpus from two public sources: language-specific Wikipedia dumps\footnote{\url{https://dumps.wikimedia.org/}} and the OSCAR Common Crawl-derived corpus\footnote{\url{https://oscar-project.org/}}. Wikimedia XML dumps served as the primary encyclopaedic source, while OSCAR provided complementary broader coverage and domain diversity from web-crawl data for all nine target languages: Yoruba, Arabic, Mandarin Chinese, Russian, Hindi, Japanese, Swahili, Bengali, and Turkish. We targeted equal per-language character allocations for all target languages to reduce high-resource bias during tokenizer training. A deliberately balanced sampling strategy was employed, as prior work shows that tokenizer training data composition and per-language representation materially affect downstream performance \cite{zhang2022robust}. To investigate the impact of training corpus size on vocabulary learning and downstream tasks, we created three balanced datasets containing 100 million, 200 million, and 400 million characters.



\subsubsection{Data Preprocessing}
\label{sec:preprocessing}
A uniform normalization procedure was applied across all corpora to ensure consistency between languages and sources. The preprocessing pipeline consisted of four sequential steps. First, text repair was performed using the \texttt{ftfy} library's \texttt{fix\_text()} function to correct smart quotes, inconsistent punctuation, and other common encoding anomalies. Second, Unicode normalization was applied via the Python \texttt{unicodedata} module, converting all characters to NFKC (Normalization Form Compatibility Composition) form to achieve canonical equivalence, compatibility decomposition, and recomposition. Third, non-printable Unicode control characters (e.g., \texttt{\textbackslash x00--\textbackslash x1F}, excluding standard whitespace) were removed to eliminate formatting artifacts. Finally, whitespace normalization was conducted by collapsing consecutive spaces, tabs, and newline characters into single spaces and trimming leading or trailing whitespace using regular expressions. Only non-empty, normalized lines were retained, resulting in a clean and consistent dataset for tokenizer training.


\subsubsection{Tokenizer training}
We set up the training pipelines using the Hugging Face Tokenizers library (for BPE and WordPiece) and SentencePiece (for Unigram), and supplied them with the preprocessed data. Three tokenization algorithms were employed:
\begin{itemize}
    \item Byte Pair Encoding (BPE)
    \item WordPiece
    \item Unigram
\end{itemize}

A total of nine tokenizer models were trained on the normalized, balanced corpus for nine languages, with three vocabulary sizes: 15k, 30k, and 50k. For BPE and WordPiece, a whitespace-based pre-tokenization strategy was applied, consistent with their standard implementations, to preserve word boundaries. In contrast, the Unigram model operated directly on raw text by design, ensuring full character coverage and better handling of scripts without reliable whitespace delimiters. The detailed configurations for each tokenizer are shown in Table~\ref{tab:tokenizer_config} and Table~\ref{tab:tokenizer_config_pre_tokenization_coverage}, providing a concise overview of the implementation, vocabulary sizes, and preprocessing strategies used.




\begin{table*}[h!]
\centering
\caption{Tokenizer configurations: Model properties.}
\label{tab:tokenizer_config}
\begin{tabular}{|l|l|l|l|}
\hline
\textbf{Tokenizer Model} & \textbf{Library} & \textbf{Algorithm Type} & \textbf{Vocab Sizes Used} \\ \hline
HF-BPE & HF Tokenizers & Byte-Pair Encoding & 15k, 30k, 50k \\ \hline
HF-WordPiece & HF Tokenizers & WordPiece & 15k, 30k, 50k \\ \hline
SP-Unigram & SentencePiece & Unigram LM & 15k, 30k, 50k \\ \hline
\end{tabular}
\end{table*}

\begin{table*}[h!]
\centering
\caption{Tokenizer configurations: Pre-tokenization and coverage.}
\label{tab:tokenizer_config_pre_tokenization_coverage}
\small
\begin{tabular}{|p{3cm}|p{2.5cm}|p{5cm}|p{2cm}|}
\hline
\textbf{Tokenizer Model} & \textbf{Pre-tokenization} & \textbf{Special Tokens} & \textbf{Character Coverage} \\ \hline
HF-BPE & Whitespace & [PAD], [UNK], [CLS], [SEP], [MASK] & 100\% \\ \hline
HF-WordPiece & Whitespace & [PAD], [UNK], [CLS], [SEP], [MASK] & 100\% \\ \hline
SP-Unigram & None (raw text) & \textless pad\textgreater, \textless unk\textgreater & 100\% \\ \hline
\end{tabular}
\end{table*}

This design allows for a direct comparison of the three algorithms under different vocabulary sizes, ensuring that downstream evaluation results can be attributed to model differences rather than data or preprocessing inconsistencies.


\subsubsection{Evaluation metrics}

To assess the quality of the trained tokenizers beyond vocabulary coverage, we evaluated them using two metrics, Normalized Sequence Length (NSL) and Subword Fertility, to compare tokenization behaviour across languages and vocabulary sizes.





\paragraph{Normalized Sequence Length}

Normalized Sequence Length (NSL) measures the average number of tokens per character in a sequence:

\begin{equation}
\text{NSL} = \frac{\text{Number of tokens}}{\text{Number of characters}}
\end{equation}

This metric reflects the granularity of segmentation: higher NSL values indicate more fine-grained tokenization.


\paragraph{Subword Fertility}

Subword Fertility measures the average number of subword tokens produced per whitespace-delimited word:

\begin{equation}
\text{Subword Fertility} = \frac{\text{Number of tokens}}{\text{Number of words}}
\end{equation}

A higher subword fertility value indicates that words are split into more subword units, which can be beneficial for handling rare words but may lead to longer sequences.

For evaluation, we curated a language-balanced dataset from the publicly available TATOEBA corpus\footnote{\url{https://tatoeba.org/}} for Yoruba and Bengali, and the TED2020 corpus\footnote{\url{https://opus.nlpl.eu/TED2020.php}} for Arabic, Mandarin Chinese, Russian, Hindi, Japanese, Swahili, and Turkish. For each language, we randomly sampled 50 sentences from the respective sources. For every sentence, we computed NSL and Subword Fertility, then averaged these scores per tokenizer. The results are given in Table~\ref{tab:nsl_subword_fertility1}.

\subsection{Downstream tasks}

\subsubsection{Part-of-Speech (POS) Tagging}

\paragraph{Data collection}

For the POS tagging task, we assembled a diverse, language-balanced dataset for the same nine languages from publicly available sources. Specifically, Arabic, Mandarin Chinese, Russian, Hindi, Japanese, and Turkish data were obtained from the Universal Dependencies treebanks, whereas Yoruba and Swahili were taken from the MasakhaPOS corpus and Bengali was sourced from the NLTK Indian corpus.

\paragraph{Preprocessing}

The preprocessing phase focused on ensuring uniformity in format and quality across all nine languages. For each dataset, only sentences in which the number of tokens exactly matched the number of POS tags were retained. All corpora were then standardized into a unified JSONL structure with three fields: tokens (list of words), tags (corresponding POS labels), and lang (ISO language code). Language codes were explicitly added to each sentence to maintain traceability. Following the balancing process, the combined dataset was randomly shuffled to mitigate any potential ordering effects during training. The final multilingual POS dataset was saved in UTF-8 encoding without altering the original text content, preserving the integrity of the gold-standard annotations.

\paragraph{Model Training}

For POS tagging, we fine-tuned a BERT-based token classification model on the balanced multilingual dataset using each of the nine trained tokenizers, which differ in vocabulary size and segmentation strategy. Each separately trained tokenizer was loaded through the Hugging Face AutoTokenizer to ensure consistent tokenization between training and evaluation. The POS model was implemented using BertForTokenClassification with the base architecture bert-base-cased, adapting the embedding layer to match each tokenizer's vocabulary size. Input sequences were tokenized in a word-aligned manner, with subword tokens inheriting the POS label of their originating word and non-aligned tokens masked with the $-100$ label to exclude them from loss computation. Training was performed with a batch size of 16, a learning rate of 5e-5, and a fixed three-epoch schedule.



\subsection{NER}

\paragraph{Data collection}

We constructed a balanced multilingual NER dataset covering all the nine languages. For Yoruba, we used the MasakhaneNER dataset in CoNLL format, containing manually annotated tokens and entity labels. For the other eight languages, we used the WikiANN multilingual NER corpus via the Hugging Face Datasets library. 


\paragraph{Preprocessing}

The Yoruba dataset was first parsed from CoNLL format into a token-tag JSON structure, handling any malformed lines by assigning an ``O'' (non-entity) tag. To unify label formats across datasets, we extracted the label set from WikiANN and mapped the Yoruba entity tags to this schema, discarding any samples containing unsupported tags. All datasets were standardized to a common feature schema consisting of \texttt{tokens}, \texttt{ner\_tags}, and \texttt{language} fields, with \texttt{ner\_tags} stored as integer indices corresponding to the unified label list. We balanced the dataset by downsampling each language to an equal number of samples (double the smallest split size between WikiANN datasets), shuffled the combined set, and split it into 80\% training and 20\% test subsets.



\paragraph{Model Training}


For the Named Entity Recognition (NER) experiments, a BERT-based token classification architecture was employed, trained using all nine tokenizer configurations. The datasets were tokenized in a manner that preserved token--tag alignment, with subword units inheriting their corresponding word-level labels. To address class imbalance in the entity label distribution, class weights were computed from the training corpus and incorporated into the loss function, improving recognition of less frequent entity classes. Each configuration was fine-tuned under the same hyperparameter settings for a fixed number of epochs, ensuring comparability across runs. Performance was evaluated using entity-level precision, recall, F1-score, and accuracy, following the seqeval metric standard for sequence labeling.


\subsection{Machine Translation}

\paragraph{Data Collection}

For the Machine Translation experiments, parallel corpora were sourced from two large-scale open datasets: OPUS100 and TED2020. OPUS100 provides high-quality sentence-aligned translations across 100 languages, while TED2020 consists of transcribed and translated TED Talk segments in multiple languages. For languages with limited high-quality coverage in OPUS100, such as Swahili, TED2020 served as the primary source. All datasets were obtained from official repositories, ensuring consistent formatting and alignment between English and the respective target languages.

\paragraph{Preprocessing}

We applied a systematic preprocessing pipeline to ensure corpus integrity and comparability. Quality filtering removed sentence pairs with identical source/target text, segments under three English tokens or two target tokens, sentences exceeding 200 characters, and text dominated by numbers or punctuation. Technical strings (HTML/XML tags, encoding markers) were excluded to eliminate non-linguistic noise. To mitigate high-resource language bias, we downsampled all datasets to match the smallest corpus size post-filtering, ensuring uniform per-language representation. The dataset was randomized with a fixed seed and split 90-10 for training/testing. Records were standardized with English sentences, translations, and language codes, producing a clean, balanced multilingual dataset for rigorous cross-linguistic evaluation.


\paragraph{Model Training}

We conducted machine translation experiments using BART-large with custom tokenizers across different vocabulary sizes and algorithms. Models were trained using Seq2SeqTrainer for 5 epochs with batch size 8, gradient accumulation steps 8 (effective batch size 64), learning rate 1e-4, 10\% warmup, weight decay 0.01, and FP16 precision. We trained a single multilingual model on nine languages simultaneously, with inputs truncated/padded to 256 tokens and language-specific formatting. Training stability was ensured through label smoothing (0.1) and gradient clipping (max norm 1.0). Generation used beam search (2 beams, max length 128). Models were evaluated every 500 steps using BLEU scores and exact match accuracy on the complete balanced multilingual dataset, enabling direct comparison across tokenization strategies.

\section{Experimental Results}



\subsection{Intrinsic Tokenizer Evaluation}

The intrinsic evaluation compares BPE, WordPiece, and SentencePiece-Unigram tokenizers for the vocabulary sizes 15k, 30k, and 50k using Normalized Sequence Length (NSL) and Subword Fertility as evaluation metrics \cite{rust2021how,acs2019subword}. NSL captures the relative tokenization length with respect to the original text, while Subword Fertility quantifies the average number of subword units generated per original token \cite{mielke2021between}. Lower values for both metrics generally indicate more compact and efficient tokenization \cite{kudo2018sentencepiece,salesky2020linguistically}.


As shown in Table~\ref{tab:nsl_subword_fertility1}, increasing vocabulary size consistently reduces both metrics, yielding more compact token sequences, with the effect most pronounced in Mandarin Chinese and Japanese due to their logographic writing systems \cite{kudo2018sentencepiece,conneau2020unsupervised}. Among tokenizers, BPE achieves the lowest NSL and fertility across most languages, while WordPiece generally produces the highest values, particularly at smaller vocabularies, and SentencePiece Unigram performs in between \cite{sennrich2016neural,wu2016google,kudo2018sentencepiece}. Language typology strongly influences outcomes, as morphologically complex languages show larger efficiency gains with larger vocabularies, whereas morphologically simpler languages like Yoruba and Swahili remain relatively stable \cite{bostrom2020byte,wang2019neural}. Overall, the results highlight that larger vocabularies provide more efficient tokenization across all algorithms, reducing sequence length and subword fragmentation, which can lower computational overhead in downstream tasks \cite{qiu2020pre}.


\begin{table*}[htbp]
\centering
\caption{NSL and Subword Fertility across tokenizers, languages, and vocabulary sizes.}
\label{tab:nsl_subword_fertility1}
\resizebox{\linewidth}{!}{%
\begin{tabular}{llcccccc}
\toprule
\textbf{Language} & \textbf{Tokenizer} & \multicolumn{2}{c}{\textbf{15k voc size}} & \multicolumn{2}{c}{\textbf{30k voc size}} & \multicolumn{2}{c}{\textbf{50k voc size}} \\
\cmidrule(lr){3-4} \cmidrule(lr){5-6} \cmidrule(lr){7-8}
& & \textbf{NSL} & \textbf{Subword Fertility} & \textbf{NSL} & \textbf{Subword Fertility} & \textbf{NSL} & \textbf{Subword Fertility} \\
\midrule
\multirow{3}{*}{Yoruba} 
& BPE & 0.4584 & 1.9911 & 0.3993 & 1.7278 & 0.3668 & 1.5783 \\
& WordPiece & 0.8137 & 3.5557 & 0.4584 & 1.9867 & 0.3827 & 1.6498 \\
& SentencePiece Unigram & 0.5471 & 2.3723 & 0.4178 & 1.8040 & 0.3963 & 1.7086 \\
\midrule
\multirow{3}{*}{Arabic} 
& BPE & 0.4908 & 2.8232 & 0.3726 & 2.1427 & 0.3316 & 1.9056 \\
& WordPiece & 0.8353 & 4.8084 & 0.4485 & 2.5773 & 0.3567 & 2.0482 \\
& SentencePiece Unigram & 0.4859 & 2.7878 & 0.3879 & 2.2264 & 0.3427 & 1.9706 \\
\midrule
\multirow{3}{*}{Mandarin Chinese} 
& BPE & 0.8479 & 8.7353 & 0.7438 & 7.6769 & 0.6985 & 7.2173 \\
& WordPiece & 0.8391 & 8.4954 & 0.7628 & 7.7758 & 0.6786 & 6.9250 \\
& SentencePiece Unigram & 0.9393 & 9.6524 & 0.8556 & 8.7974 & 0.8166 & 8.4003 \\
\midrule
\multirow{3}{*}{Russian} 
& BPE & 0.4621 & 3.3589 & 0.3367 & 2.4322 & 0.2990 & 2.1634 \\
& WordPiece & 0.8701 & 6.3412 & 0.4406 & 3.1862 & 0.3276 & 2.3668 \\
& SentencePiece Unigram & 0.4879 & 3.5411 & 0.3414 & 2.4549 & 0.3091 & 2.2264 \\
\midrule
\multirow{3}{*}{Hindi} 
& BPE & 0.4727 & 2.5893 & 0.3566 & 1.9619 & 0.3258 & 1.8028 \\
& WordPiece & 0.7793 & 4.1668 & 0.4210 & 2.2823 & 0.3377 & 1.8444 \\
& SentencePiece Unigram & 0.5174 & 2.8608 & 0.3819 & 2.1243 & 0.3497 & 1.9463 \\
\midrule
\multirow{3}{*}{Japanese} 
& BPE & 0.7739 & 11.0098 & 0.5978 & 8.4776 & 0.5290 & 7.4856 \\
& WordPiece & 0.9168 & 13.1019 & 0.7082 & 10.0753 & 0.5775 & 8.1898 \\
& SentencePiece Unigram & 0.8368 & 11.8092 & 0.6681 & 9.3931 & 0.5973 & 8.3685 \\
\midrule
\multirow{3}{*}{Swahili} 
& BPE & 0.4008 & 2.6293 & 0.2953 & 1.9350 & 0.2620 & 1.7163 \\
& WordPiece & 0.8573 & 5.6392 & 0.3759 & 2.4664 & 0.2910 & 1.9076 \\
& SentencePiece Unigram & 0.4068 & 2.6739 & 0.3031 & 1.9849 & 0.2632 & 1.7273 \\
\midrule
\multirow{3}{*}{Bengali} 
& BPE & 0.4270 & 2.9301 & 0.2897 & 1.9807 & 0.2479 & 1.6934 \\
& WordPiece & 0.8569 & 5.8917 & 0.3897 & 2.6693 & 0.2759 & 1.8865 \\
& SentencePiece Unigram & 0.4621 & 3.1692 & 0.2958 & 2.0243 & 0.2512 & 1.7167 \\
\midrule
\multirow{3}{*}{Turkish} 
& BPE & 0.4295 & 3.3339 & 0.3166 & 2.4487 & 0.2807 & 2.1667 \\
& WordPiece & 0.8805 & 6.8531 & 0.3937 & 3.0524 & 0.3050 & 2.3543 \\
& SentencePiece Unigram & 0.4482 & 3.4741 & 0.3262 & 2.5227 & 0.2826 & 2.1694 \\
\bottomrule
\end{tabular}%
}
\end{table*}





\subsection{POS Tagging Results}


We further compare tokenizers on POS tagging across vocabulary sizes using accuracy and F1 metrics. From Table~\ref{tab:POS}, it can be inferred that WordPiece achieves the highest overall performance, with a test accuracy of 0.7830 and weighted F1 of 0.7722 at 15k, consistently outperforming BPE and SentencePiece Unigram. This can be attributed to WordPiece's ability to preserve morphologically meaningful units, which benefits POS tagging where syntactic boundaries are crucial \cite{straka2016udpipe}. In contrast, BPE prioritizes frequency-based merges, often splitting or merging across morpheme boundaries, which reduces efficiency for this task despite shorter sequences \cite{sennrich2016neural,kudo2018subword}. SentencePiece Unigram shows intermediate behavior, offering slightly higher macro-F1 than BPE but lacking the stability of WordPiece \cite{kudo2018sentencepiece}. Notably, increasing vocabulary size does not improve results and in some cases reduces accuracy, as larger vocabularies can overspecialize subword units and lose the generalization capacity needed for POS tagging \cite{nivre2016universal,kann2016single,mielke2021between}.



\begin{table*}[htbp]
\centering
\caption{POS Performance comparison across tokenizers, vocabulary sizes, and metrics. Best values per block are highlighted in bold.}
\label{tab:POS}
\resizebox{\linewidth}{!}{
\begin{tabular}{llccccccc}
\toprule
\textbf{Tokenizer} & \textbf{Voc size} & \textbf{Epoch} & \textbf{Train Acc} & \textbf{Train F1 Macro} & \textbf{Train F1 Weighted} & \textbf{Test Acc} & \textbf{Test F1 Macro} & \textbf{Test F1 Weighted} \\
\midrule
{BPE} & \multirow{3}{*}{15k} & 3 & 0.7847 & 0.2957 & 0.7722 & 0.7207 & 0.2973 & 0.7057 \\
{WordPiece}& & 3 & \textbf{0.8413} & \textbf{0.3889} & \textbf{0.8325} & \textbf{0.7830} & \textbf{0.3952} & \textbf{0.7722} \\
{SentencePiece Unigram}& & 3 & 0.8067 & 0.3209 & 0.7947 & 0.7484 & 0.3299 & 0.7340 \\
\midrule
{BPE} & \multirow{3}{*}{30k} & 3 & 0.7526 & 0.2686 & 0.7366 & 0.6932 & 0.2724 & 0.6735 \\
{WordPiece} & & 3 & \textbf{0.7964} & 0.2877 & \textbf{0.7839} & \textbf{0.7325} & 0.2915 & \textbf{0.7174} \\
{SentencePiece Unigram} & & 3 & 0.7752 & \textbf{0.3141} & 0.7625 & 0.7211 & \textbf{0.3218} & 0.7061 \\
\midrule
{BPE} & \multirow{3}{*}{50k} & 3 & 0.7643 & 0.2719 & 0.7484 & 0.7015 & 0.2775 & 0.6812 \\
{WordPiece}& & 3 & \textbf{0.8001} & 0.2848 & \textbf{0.7880} & \textbf{0.7335} & 0.2883 & \textbf{0.7187} \\
{SentencePiece Unigram}& & 3 & 0.7662 & \textbf{0.3085} & 0.7528 & 0.7107 & \textbf{0.3141} & 0.6944 \\
\bottomrule
\end{tabular}
}
\end{table*}


\subsection{NER Results}

Table~\ref{tab:NER} presents the NER performance across tokenizers, vocabulary sizes, and evaluation metrics. With 15k vocabulary size, WordPiece significantly outperforms BPE and SentencePiece Unigram, achieving the highest Test F1 (0.5844) and Test Accuracy (0.7644). This suggests that WordPiece is particularly effective in low-vocabulary regimes, where its ability to balance word-level and subword-level information aids entity boundary recognition \cite{devlin2019bert,wu2016google,li2020unified}. As vocabulary size increases to 30k and 50k, BPE shows competitive performance, especially in Test Accuracy (0.7032 at 50k), while SentencePiece occasionally surpasses BPE in terms of F1 score. However, WordPiece maintains overall superiority, albeit with diminishing margins, likely due to reduced fragmentation and more stable subword segmentation as vocabulary size grows \cite{klein2017opennmt}.

Overall, these results indicate that WordPiece offers the best generalization for NER at smaller vocabulary sizes, while BPE and SentencePiece Unigram become more competitive at larger vocabularies, reflecting the trade-off between segmentation granularity and contextual representation in sequence labeling tasks \cite{peters2018deep,akbik2018contextual,mielke2021between}.



\begin{table*}[htbp]
\centering
\caption{NER Performance comparison across tokenizers, vocabulary sizes, and metrics}
\resizebox{\linewidth}{!}{
\begin{tabular}{@{}lcccccccccc@{}}
\toprule
\textbf{Tokenizer} & \textbf{Voc size} & \textbf{Epoch} & \textbf{Train Precision} & \textbf{Train Recall} & \textbf{Train F1} & \textbf{Train Accuracy} & \textbf{Test Precision} & \textbf{Test Recall} & \textbf{Test F1} & \textbf{Test Accuracy} \\
\midrule
BPE & 15k & 3 & 0.3113 & 0.6453 & 0.4200 & 0.6878 & 0.2452 & 0.5088 & 0.3309 & 0.6324 \\
WordPiece & & 3 & \textbf{0.5990} & \textbf{0.8601} & \textbf{0.7062} & \textbf{0.8374} & \textbf{0.4937} & \textbf{0.7157} & \textbf{0.5844} & \textbf{0.7644} \\
SentencePiece Unigram & & 3 & 0.4949 & 0.8398 & 0.6228 & 0.7856 & 0.3769 & 0.6441 & 0.4756 & 0.7102 \\
\midrule
BPE & 30k & 3 & 0.3730 & 0.7452 & 0.4971 & 0.7604 & 0.2640 & 0.5393 & 0.3545 & 0.6828 \\
WordPiece & & 3 & \textbf{0.4432} & \textbf{0.8106} & \textbf{0.5731} & \textbf{0.7898} & \textbf{0.3251} & \textbf{0.6063} & \textbf{0.4233} & \textbf{0.7014} \\
SentencePiece Unigram & & 3 & 0.4120 & 0.7810 & 0.5394 & 0.7449 & 0.2920 & 0.5690 & 0.3860 & 0.6532 \\
\midrule
BPE & 50k & 3 & 0.3951 & 0.7533 & 0.5183 & 0.7902 & 0.2650 & 0.5290 & 0.3531 & 0.7032 \\
WordPiece & & 3 & \textbf{0.4266} & 0.7842 & 0.5525 & \textbf{0.7927} & \textbf{0.3034} & \textbf{0.5767} & \textbf{0.3976} & \textbf{0.7153} \\
SentencePiece Unigram & & 3 & 0.4258 & \textbf{0.7885} & \textbf{0.5530} & 0.7599 & 0.2914 & 0.5515 & 0.3813 & 0.6694 \\
\bottomrule
\end{tabular}
}
\label{tab:NER}
\end{table*}



\subsection{Machine Translation Results}

Table~\ref{tab:MT} presents the performance of different tokenization methods on multilingual BART-large across vocabulary sizes of 15k, 30k, and 50k. BPE with a 15k vocabulary achieved the best BLEU score (0.1226) due to its efficient segmentation of rare words into frequent subword units, balancing vocabulary compactness with representational power~\cite{sennrich2016neural}. WordPiece performed poorly (BLEU = 0.0103) because its tendency to favor longer subwords led to insufficient coverage of morphologically rich words in multilingual data~\cite{wu2016google}. SentencePiece Unigram produced shorter prediction lengths, suggesting under-segmentation effects.
At the 30k vocabulary size, WordPiece achieved a BLEU of 0.1136, marginally ahead of BPE (0.1135). SentencePiece Unigram showed the highest Exact Match (0.086) despite lower BLEU, indicating that its probabilistic subword sampling captured more precise token boundaries, though its aggressive segmentation may reduce fluency in longer sequences~\cite{mielke2021between}. For the 50k vocabulary, SentencePiece Unigram achieved the highest Exact Match (0.096) while WordPiece obtained the strongest BLEU score (0.1218). This suggests WordPiece benefits from larger vocabularies by covering more lexical items directly, improving overall fluency~\cite{devlin2019bert,wu2016google}, whereas Unigram maintains precision in token alignment but at the cost of sequence length imbalance~\cite{kudo2018sentencepiece}. BPE underperformed as vocabulary grew, likely due to oversplitting that reduced sequence-to-sequence alignment efficiency.
These results suggest BPE is effective for smaller vocabularies, WordPiece scales better with larger vocabularies, and SentencePiece Unigram excels in exact matching but generates shorter sequences. The performance differences stem from how each tokenizer balances subword granularity, vocabulary coverage, and sequence length, which directly impact BLEU and Exact Match metrics \cite{post2018call}.



\begin{table*}[htbp]
\centering
\caption{Performance comparison of different tokenization methods on multilingual BART-large. 
Best scores per vocabulary size are in \textbf{bold}.}
\label{tab:MT}
\scalebox{0.80}{
\begin{tabular}{llcccc}
\toprule
\textbf{Voc Size} & \textbf{Tokenizer Type}  & 
\textbf{BLEU} & \textbf{Exact Match} & \textbf{Avg. Pred Len} & \textbf{Avg. Label Len} \\
\midrule
\multirow{3}{*}{15k} 
& BPE          & \textbf{0.1226} & \textbf{0.069} & 27.41 & 23.05 \\
& WordPiece    & 0.0103 & 0 & 20 & 35.11 \\
& SentencePiece Unigram & 0.0526 & 0.031 & 9.14 & 7.37 \\
\midrule
\multirow{3}{*}{30k} 
& BPE          & 0.1135 & 0.006 & 17.99 & 18.55 \\
& WordPiece    & \textbf{0.1136} & 0.067 & 23.30 & 21.65 \\
& SentencePiece Unigram & 0.0966 & \textbf{0.086} & 7.58 & 7.37 \\
\midrule
\multirow{3}{*}{50k} 
& BPE          & 0.1039 & 0.055 & 22.35 & 16.97 \\
& WordPiece    & \textbf{0.1218} & 0.078 & 19.94 & 17.96 \\
& SentencePiece Unigram & 0.1039 & \textbf{0.096} & 7.13 & 7.37 \\
\bottomrule
\end{tabular}}
\end{table*}


\begin{figure*}
	\centering
    \includegraphics[width=0.8\textwidth]{Machine Translation.png}
	% \caption{The bar charts compare the performance of different tokenization methods (BPE, WordPiece, and SentencePiece Unigram) across vocabulary sizes of 15k, 30k, and 50k using BLEU and Exact Match metrics. For BLEU, BPE achieves the highest score at 15k (0.1226), while WordPiece surpasses at larger vocabularies (30k and 50k), reaching 0.1136 and 0.1218 respectively. SentencePiece shows consistently lower BLEU scores but performs more stably across sizes. In Exact Match, SentencePiece stands out at higher vocabularies, peaking at 0.096 with 50k, while BPE dominates at 15k (0.069). WordPiece demonstrates balanced performance, improving steadily with larger vocab sizes. Overall, the results highlight that smaller vocabularies favor BPE for BLEU, while larger vocabularies shift the advantage to WordPiece and SentencePiece, particularly in terms of Exact Match accuracy.}
    \caption{BLEU and Exact Match performance of tokenization methods across vocabulary sizes on multilingual machine translation, showing vocabulary-dependent performance patterns for each algorithm.}
	\label{fig:fig1}
\end{figure*}

\section{Discussion and Conclusion}


This study systematically investigated the impact of balanced multilingual datasets on tokenizer performance across nine typologically diverse languages, revealing significant insights into how dataset composition affects both tokenization efficiency and downstream task performance. Our findings demonstrate that balanced training data substantially improves tokenization fairness, with BPE consistently achieving the lowest Normalized Sequence Length and Subword Fertility values across most languages, indicating more compact tokenization through its frequency-based merging strategy. Notably, logographic languages like Mandarin Chinese and Japanese showed dramatic improvements with larger vocabularies, with NSL values decreasing from 0.85 to 0.70 for Chinese when moving from 15k to 50k vocabulary sizes. In downstream tasks, tokenizer choice proved highly task-dependent: WordPiece excelled in POS tagging (accuracy: 0.7830) and NER (F1: 0.5844) due to its ability to preserve morphologically meaningful boundaries, while BPE performed best in machine translation at smaller vocabularies (BLEU: 0.1226 at 15k). These results provide empirical evidence that balanced datasets enable tokenizers to learn more representative subword units across diverse writing systems, reducing the over-segmentation typically observed in low-resource languages and addressing computational inequities where underrepresented languages can incur up to 68\% additional processing costs.

While our results are promising, several limitations must be acknowledged: our evaluation covers only three subword tokenization algorithms (BPE, WordPiece, and Unigram), a single fully balanced data regime, and three downstream tasks. Future research should address these limitations by expanding evaluation to character-level and neural tokenization methods and exploring intermediate balancing strategies tailored to specific language families. We plan to extend our framework to generative tasks where tokenizer choice may have different implications for output quality and fairness, and develop dynamic balancing strategies that adapt to evolving multilingual corpora. Multi-tokenizer approaches within single models represent a promising avenue for leveraging different tokenization strategies simultaneously across languages or tasks.

This study establishes the critical importance of balanced training data in achieving fair and efficient multilingual tokenization. While optimal tokenizer choice remains task-dependent, balanced datasets consistently improve performance across all evaluated conditions. By bridging tokenizer efficiency and linguistic fairness, this work contributes to inclusive language technologies that serve diverse global communities with computational equity, moving beyond one-size-fits-all paradigms toward more equitable multilingual NLP systems.

\appendix

\section{Computational Resources}

All experiments in this study were conducted using two distinct computational environments. Tokenizer training for all nine languages across three vocabulary sizes (15k, 30k, 50k) and three algorithms (BPE, WordPiece, Unigram), along with Part-of-Speech tagging and Named Entity Recognition model training, was performed using Google Colab with a Tesla T4 GPU (16\,GB VRAM) and Python 3.10 with CUDA 11.8. Machine Translation experiments using multilingual BART-large required enhanced computational capacity and were conducted on a local workstation equipped with an NVIDIA GeForce RTX 4090 (24\,GB VRAM) and CUDA 11.8. The choice of computational environments was determined by the memory requirements of each task, with the larger BART-large models for machine translation necessitating the higher VRAM capacity of the RTX 4090 GPU.



\bibliographystyle{sn-mathphys-ay}
\bibliography{references}



\end{document}
