\documentclass[runningheads]{llncs}

% --------------------
% PACKAGES
% --------------------
\usepackage{graphicx}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{siunitx}
\usepackage{enumitem}
\usepackage{mathtools}
\usepackage{microtype}
\usepackage{hyperref}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{tikz}
\usepackage{adjustbox}
\usepackage{cite}
\usepackage{color}
\usepackage{float}       % for [H]
\usepackage{subcaption}  % for subfigures
\usepackage[section]{placeins}  % in the preamble
\usepackage{wrapfig} 



\hypersetup{colorlinks=true, linkcolor=black, citecolor=black, urlcolor=black}
\sisetup{round-mode=places,round-precision=1}

% --------------------
% TITLE / AUTHOR
% --------------------
\title{Advancing NLP Equity: A Secondary Benchmark Evaluation of Multilingual Language Models for Underrepresented Languages}
\titlerunning{Advancing NLP Equity}

\author{
Md Muntaqim Meherab\inst{1}\thanks{Corresponding author: \email{meherab2305101354@diu.edu.bd}} \and
Salman\inst{1} \and
Md. Maruf Billah\inst{2} \and
Kazi Shakkhar Rahman\inst{3} \and
Liza Sharmin\inst{1} \and
Tanvirul Islam\inst{1} \and
Z N M Zarif Mahmud\inst{1} \and
Nuruzzaman Faruqui\inst{1} \and
Sheak Rashed Haider Noori\inst{1} \and
Touhid Bhuiyan\inst{4}
}


\institute{
Daffodil International University, Dhaka, Bangladesh\\
\email{\{meherab2305101354, salman2305101404, tanvirulislam.cse, zarif.cse, headcse, deanfhss, faruqui.swe\}@diu.edu.bd}
\and
North South University, Dhaka, Bangladesh\\
\email{maruf.billah.232@northsouth.edu}
\and
Islamic University of Technology, Gazipur, Bangladesh\\
\email{shakkharrahman@iut-dhaka.edu}
\and
Washington University of Science and Technology, Alexandria, VA, USA\\
\email{touhid.bhuiyan@wust.edu}
}

\authorrunning{M. M. Meherab et al.}


% --------------------
% DOCUMENT
% --------------------
\begin{document}
\maketitle

\begin{abstract}
Recent multilingual language models promise support for ``100+ languages,'' yet speakers of Indigenous and other underrepresented languages still often do not see themselves in these advances.
In this work, we take a deliberately simple, secondary-benchmark perspective: rather than proposing a new model or dataset, we re-evaluate an off-the-shelf multilingual natural language inference (NLI) model on public benchmarks that explicitly include Indigenous languages of the Americas.
Concretely, we use the AmericasNLI benchmark for ten Indigenous languages and XNLI for English and Spanish, and we evaluate the widely used \texttt{joeddav/xlm-roberta-large-xnli} model under a fixed, zero-shot protocol.
Our goal is to answer three questions:
(\emph{i}) How large is the performance gap between high-resource and underrepresented languages under the same model and task?
(\emph{ii}) Are these gaps consistent across languages, or do some communities fare systematically worse than others?
(\emph{iii}) What kinds of qualitative errors arise, and what do they suggest about cultural and linguistic mismatch?
Our experiments reveal a striking discrepancy: while English and Spanish reach almost perfect accuracy on XNLI (around 99.8\% on our runs), the same model averages only about 43\% accuracy across ten Indigenous languages in AmericasNLI, with none exceeding 47\%.
We also show qualitative NLI failures in Quechua that point to difficulties with morphology, idioms, and discourse-level inference.
We argue that even such a simple re-analysis can serve as a low-cost yet high-impact tool for making inequities in multilingual NLP visible, especially for communities that rarely appear in headline benchmarks.
\end{abstract}

\keywords{Multilingual NLP \and Natural language inference \and Low-resource languages \and Indigenous languages \and Language equity \and AmericasNLI \and XLM-R}

\section{Introduction}

Large pretrained language models have reshaped natural language processing (NLP) in just a few years.
Transformer architectures~\cite{vaswani2017attention} and multilingual pretraining~\cite{devlin2019bert,conneau2020xlmr} have led to impressive gains across tasks and languages, and many models now advertise support for ``100+ languages'' out of the box.
From a distance, this can give the impression that the field is on track to serve a linguistically diverse world.

A closer look, however, tells a more uneven story.
A growing body of work shows that most of the benefits of modern NLP are concentrated in a small set of high-resource languages, typically major European and East Asian languages~\cite{joshi2020state,blasi2022}.
Speakers of Indigenous and minoritized languages often see little or no improvement in everyday tools---if such tools exist at all.
This is not just a technical detail; it is part of a broader pattern in which linguistic communities already marginalized in education, media, and governance are left behind in the digital sphere as well.

In this paper, we focus on this gap from a pragmatic angle.
Rather than introducing a new model, we ask a simpler question:
\emph{If we take a widely used multilingual model exactly as it is, and we evaluate it carefully on a benchmark that explicitly includes underrepresented languages, what story do the numbers tell?}
This type of secondary benchmarking is not glamorous.
However, it can be done quickly, it is reproducible, and it can provide clear evidence that is easy to communicate both inside and outside the research community.

We centre our study on \textbf{Natural Language Inference (NLI)}, a standard testbed for higher-level semantic understanding.
We treat NLI as a proxy for whether a language model can handle non-trivial semantics in languages that are rarely represented in its training data.
For languages with very limited NLP resources, even basic NLI competence is far from guaranteed.

Concretely, we evaluate the following setup:

\begin{itemize}
    \item \textbf{Model:} \texttt{joeddav/xlm-roberta-large-xnli}, an XLM-R large model fine-tuned on XNLI for NLI in 15 languages.
    \item \textbf{Low-resource benchmark:} AmericasNLI, an NLI dataset in ten Indigenous languages of the Americas, designed for zero-shot evaluation of large multilingual models.
    \item \textbf{High-resource reference:} XNLI test sets for English and Spanish, two languages explicitly included during fine-tuning.
\end{itemize}

We deliberately \emph{do not} fine-tune or adapt the model.
Instead, we use a uniform, zero-shot inference protocol across all languages.
This allows us to ask: under the same task, architecture, and label space, how differently does the model behave depending on which community's language it is exposed to?

We structure our analysis around three research questions:

\begin{description}
    \item[RQ1:] How well does a widely used multilingual NLI model perform on truly low-resource Indigenous languages compared with high-resource languages?
    \item[RQ2:] Are performance gaps consistent across Indigenous languages, or do some communities experience systematically worse performance than others?
    \item[RQ3:] What qualitative error patterns emerge, and what do they suggest about the model's handling of morphology, idioms, and discourse in these languages?
\end{description}

Our experimental design is intentionally modest, both in scope and in resource demands.
We use only public datasets and a single off-the-shelf model.
The entire evaluation can be reproduced on a single GPU within roughly a day, making it accessible to students, small labs, and community-based researchers who may not have access to large compute clusters.

Despite its simplicity, the results are sobering.
In our runs, the XNLI-fine-tuned model reaches nearly perfect accuracy on English and Spanish (around 99.9\% and 99.6\%, respectively), but averages only about 43\% accuracy across the ten AmericasNLI languages.
Accuracy for individual Indigenous languages hovers in the low-40\% range, with no language above 47\%.
In other words, a model that is effectively ``solving'' NLI in high-resource settings behaves more like a weak baseline when asked to support Indigenous communities.

Beyond the headline numbers, qualitative error analysis for Quechua reveals systematic confusions between contradiction and neutral, as well as a tendency to over-predict entailment in the presence of complex morphology and discourse markers.
These errors align with broader concerns that current multilingual models often underrepresent the structural and cultural diversity of the world's languages~\cite{joshi2020state,bird2022local}.

Our contributions are:

\begin{itemize}
    \item A simple but rigorous secondary-benchmark evaluation of a widely used multilingual NLI model on AmericasNLI and XNLI, quantifying the performance gap between high-resource and Indigenous languages.
    \item A set of language-level and aggregate metrics that make these gaps easy to communicate and compare.
    \item A small but concrete qualitative analysis of NLI failures in Quechua, illustrating how errors connect to morphology, idioms, and discourse.
    \item A reproducible, low-cost evaluation pipeline that can serve as a template for similar audits of other multilingual models and tasks.
\end{itemize}

Our results do not claim to be the final word on fairness in multilingual NLI.
They are, however, a reminder that even small, focused re-evaluations can surface inequities that might otherwise be hidden behind impressive average scores.



\section{Background and Related Work}

\subsection{Multilingual Pretrained Language Models}

Transformer-based pretrained language models such as BERT~\cite{devlin2019bert}
and its multilingual variants have become the backbone of modern NLP\@.
Multilingual BERT (mBERT) and XLM-R~\cite{conneau2020xlmr} extend this
paradigm by jointly pretraining on large corpora spanning dozens or hundreds
of languages. These models have delivered strong cross-lingual transfer on
tasks such as part-of-speech tagging, question answering, and NLI, especially
for languages with reasonable amounts of text in pretraining corpora.

However, pretraining coverage is not synonymous with equitable performance.
Even when a language is technically present in the pretraining mix, it may be
represented by orders of magnitude fewer tokens than English or other
high-resource languages. Recent studies have shown that, in practice,
performance tends to track data availability and socio-economic status of
language communities rather than any intrinsic linguistic property~\cite{blasi2022}.
To measure this imbalance more systematically, benchmarks such as
XTREME~\cite{hu2020xtreme} and its successor XTREME-R~\cite{ruder2021xtremerr}
were developed to span dozens of languages and several task types, and in
doing so they exposed how sharply cross-lingual transfer quality degrades once
a target language falls outside the well-resourced core of the pretraining corpus.
Lauscher et al.~\cite{lauscher2020zero} pushed this finding further, showing
that zero-shot transfer is considerably more brittle than headline numbers
suggest---even modest shifts in morphological complexity or domain can cause
substantial accuracy drops for languages at the lower end of the resource
spectrum.

\subsection{Cross-Lingual NLI Benchmarks}

Natural Language Inference has become a standard testbed for semantic
understanding across languages. XNLI~\cite{conneau2018xnli} extends the
English MultiNLI dataset to 15 languages via translation, providing a
benchmark for evaluating multilingual sentence representations and
cross-lingual transfer. Models like XLM-R are often fine-tuned on XNLI for
these languages and then reused for zero-shot classification in other
settings.

While XNLI has driven substantial progress, its language coverage remains
skewed toward high-resource languages. To address this gap, AmericasNLI
introduces NLI test and validation sets for ten Indigenous languages of the
Americas, including Ash\'{a}ninka, Aymara, Bribri, Guaran\'{i}, Nahuatl,
Otom\'{i}, Quechua, Rar\'{a}muri, Shipibo-Konibo, and Wixarika.
Ebrahimi et al.~\cite{ebrahimi2022americasnli}, who introduced
AmericasNLI, designed it specifically to expose the limits of zero-shot
transfer for languages that had been all but invisible in mainstream
multilingual evaluation---none of the target languages appear in the
fine-tuning data of the models the benchmark is intended to test.
Complementary evidence from Clark et al.~\cite{clark2020tydi} in the
question-answering domain reinforces the same concern: their TyDi QA
benchmark, built around typologically diverse languages, shows that
model performance closely tracks the linguistic distance between a target
language and the high-resource languages that dominate pretraining, making
typological coverage a concrete fairness issue rather than a purely academic one.
Our work builds directly on this line of research. Rather than training new
models, we use AmericasNLI as a lens to examine how an off-the-shelf
XNLI-fine-tuned model behaves when evaluated on these languages alongside
its original high-resource targets.

\subsection{Inequalities in Language Technology}

Several studies have documented systematic inequalities in language technology.
Joshi et al.~\cite{joshi2020state} quantify how a small set of languages
dominate NLP research and resources, while most of the world's languages
receive little or no attention. Blasi et al.~\cite{blasi2022} provide a
global perspective on disparities in performance across languages, linking
them to a combination of resource availability, economic indicators, and
colonial history.

Other work has argued that resource labels such as ``low-resource'' can
obscure important distinctions between standardised, local, and contact
languages, and can carry their own power dynamics~\cite{bird2022local}.
Nekoto et al.~\cite{nekoto2020participatory} offer a practical illustration
of this point through a large-scale, community-led machine translation
initiative for African languages, demonstrating that sustainable progress
for underserved communities depends not just on model improvements but on
genuine participatory involvement of those communities in data creation,
evaluation, and deployment decisions.
From a governance perspective, these disparities have implications for who
gets to shape the development and deployment of language technologies, and
whose values and communicative practices are encoded in them.

Our study sits within this broader conversation but with a narrower
empirical focus. We ask a concrete question about one widely used
multilingual NLI model and two well-defined benchmarks, with the aim of
providing clear, reproducible evidence that can be situated within a larger
body of work on linguistic equity in NLP\@.

\subsection{Secondary Benchmarking and Model Audits}

There is a growing recognition that evaluating existing models under new
conditions can be as valuable as building new models, especially for
fairness and governance. Secondary benchmarking---reusing public models and
datasets to answer new questions---offers an accessible way to conduct such
audits. It lowers the entry barrier for researchers and community members
who may not have resources to train large models but do have important
questions about how those models behave.

In the multilingual setting, secondary benchmarking has been used to
examine gender bias, toxicity, and domain shifts across languages.
Our work follows this spirit: we use standard tools (Hugging Face
Transformers), public benchmarks (AmericasNLI, XNLI), and a simple
evaluation pipeline to highlight how much performance can change when
moving from well-served to underrepresented languages.

Table~\ref{tab:positioning_prior_work} summarises how our study sits in relation to key
strands of prior work on multilingual models, low-resource NLI, and
linguistic equity.

\begin{table*}[t]
\centering
\caption{Positioning Our Work Within Prior Research}
\label{tab:positioning_prior_work}
\renewcommand{\arraystretch}{1.15}
\begin{tabular}{@{}p{0.17\textwidth} p{0.38\textwidth} p{0.38\textwidth}@{}}
\toprule
\textbf{Area} & \textbf{What Existing Work Shows} & \textbf{What Our Study Adds} \\
\midrule
Multilingual pretrained language models &
Large models like mBERT and XLM-R deliver strong results for well-represented languages, but performance tends to follow data availability and global socio-economic patterns. &
We provide a concrete, task-specific demonstration of this imbalance by quantifying how sharply performance drops for Indigenous American languages under identical evaluation conditions. \\
\midrule
Cross-lingual NLI benchmarks &
XNLI has become a standard benchmark but is skewed toward high-resource languages; AmericasNLI was created to fill this gap by providing truly low-resource test sets. &
Instead of proposing new data or models, we treat AmericasNLI as a diagnostic tool, using it to reveal how a commonly deployed XNLI-tuned model behaves when confronted with languages outside its design scope. \\
\midrule
Inequalities in language technology &
Prior studies document global disparities in NLP performance, resource availability, and historical factors shaping linguistic representation. &
We offer a focused, reproducible case study showing how these systemic patterns manifest in a familiar task (NLI), turning abstract inequality findings into specific, measurable outcomes. \\
\midrule
Secondary benchmarking and model audits &
Auditing models under new conditions has become increasingly recognized as a lightweight but meaningful fairness practice. &
We contribute a straightforward audit procedure that others can reuse, emphasizing accessibility for researchers without large computational budgets. \\
\midrule
Evaluation of underrepresented languages &
Much prior work focuses on building models, datasets, or adaptation strategies for low-resource languages. &
Our role is complementary: we highlight the baseline reality of out-of-the-box model behavior, establishing a reference point that any future improvements should exceed. \\
\bottomrule
\end{tabular}
\end{table*}



\section{Task and Problem Setting}

\subsection{Natural Language Inference as a Probe}

We frame our study around \textbf{Natural Language Inference (NLI)}.
Given a \emph{premise} sentence and a \emph{hypothesis} sentence, the task is to predict whether the hypothesis is \emph{entailed} by the premise, \emph{contradicted} by it, or \emph{neutral}.
Even though NLI is an idealised task, it requires models to engage with semantic phenomena such as negation, quantification, lexical relations, and basic world knowledge.

For high-resource languages like English, NLI benchmarks have become mature enough that large pretrained models approach human-level performance.
This makes NLI a useful probe: if a model that ``solves'' NLI in English performs poorly on the same task in an Indigenous language, this suggests that the underlying semantic capabilities have not transferred equitably.

Formally, let $\mathcal{S}$ denote the set of sentences in a given language, and let
$\mathcal{Y} = \{\text{entailment}, \text{neutral}, \text{contradiction}\}$ be the label space.
Each NLI example is a pair $(p, h) \in \mathcal{S} \times \mathcal{S}$ consisting of a premise $p$
and a hypothesis $h$, together with a gold label $y \in \mathcal{Y}$.
A classifier $f_\theta$ (here, the XNLI-fine-tuned multilingual model) implements a mapping
\begin{equation}
    f_\theta \colon \mathcal{S} \times \mathcal{S} \rightarrow \mathcal{Y}, \quad
    f_\theta(p, h) = \hat{y},
\end{equation}
where $\hat{y}$ is the predicted NLI label.
In this work, we do not update $\theta$; we evaluate a fixed pretrained $f_\theta$ in a zero-shot setting across languages.


\subsection{Underserved Communities and Indigenous Languages}

We follow prior work in treating speakers of Indigenous languages of the Americas as a concrete example of underserved communities in NLP\@.
These languages are often underrepresented in educational systems, under-resourced in terms of digital tools, and sometimes endangered in terms of intergenerational transmission.
For many of them, even basic NLP infrastructure---tokenizers, morphological analyzers, or spell-checkers---is still in early stages or entirely absent.

Importantly, these languages are not just data points or test cases.
They encode rich cultural histories, oral traditions, and epistemologies that have developed over centuries.
When language technologies fail to support these languages, they risk reinforcing existing patterns of exclusion from digital spaces.
Our goal in this paper is not to ``fix'' these issues, but to help document them in a way that is concrete, reproducible, and legible to the broader NLP community.

\subsection{Research Questions}

Given this context, our study is guided by three research questions (RQ1--RQ3), which we restate here with slightly more operational detail:

\begin{description}
    \item[RQ1:] \emph{How large is the performance gap between high-resource and Indigenous languages on NLI under a fixed multilingual model and evaluation protocol?}\\
    We measure language-wise accuracy and macro-F1, and compare the mean performance of English and Spanish (high-resource languages included in XNLI fine-tuning) against that of the ten AmericasNLI languages.
    \item[RQ2:] \emph{Do performance gaps vary substantially across Indigenous languages?}\\
    We examine per-language metrics to see whether all languages are equally disadvantaged, or whether some languages consistently receive better or worse support from the model.
    \item[RQ3:] \emph{What kinds of qualitative errors does the model make in an Indigenous language, and what do these errors suggest about its handling of morphology, idioms, and discourse?}\\
    We focus on Quechua as a case study, collecting a small sample of misclassified NLI pairs and analysing where the model tends to over-predict entailment or confuse contradiction with neutral.
\end{description}

These questions are intentionally narrow.
They do not address all dimensions of fairness or usability, but they provide a concrete starting point that others can extend with more sophisticated analyses or additional models.

\section{Methodology}
\label{sec:methodology}



Our methodology is designed around a few principles:
use only public resources; keep the evaluation protocol simple and transparent; and separate high-resource and low-resource settings as clearly as possible.

\subsection{Datasets}

\subsubsection{AmericasNLI}

The AmericasNLI benchmark~\cite{ebrahimi2022americasnli} extends the XNLI framework to ten Indigenous languages of the Americas.
For each language, it provides development and test splits with several hundred examples each, derived from parallel annotations aligned with English NLI data.
The target languages include:

Asháninka (\texttt{cni}), Aymara (\texttt{aym}), Bribri (\texttt{bzd}), Guaraní (\texttt{gn}), Nahuatl (\texttt{nah}), Otomí (\texttt{oto}), Quechua (\texttt{quy}), Rarámuri (\texttt{tar}), Shipibo-Konibo (\texttt{shp}), and Wixarika (\texttt{hch}).




In our experiments, we use the test split for each language.
The number of test examples per language ranges from roughly 738 to 750, following the statistics provided with the dataset.
We treat these languages as representative examples of underrepresented communities in current NLP practice: they are typologically diverse, historically marginalized, and largely absent from mainstream training corpora.

\subsubsection{XNLI}

For a high-resource reference point, we use the XNLI test sets for English and Spanish.
XNLI extends MultiNLI to 15 languages via translation and serves as a standard benchmark for cross-lingual NLI.
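Crucially, the \texttt{joeddav/xlm-roberta-large-xnli} model we evaluate has been fine-tuned on NLI data in all 15 XNLI languages, including English and Spanish.
% NOTE(review): confirm against the model card which XNLI splits were used for fine-tuning; if the test split was included, the English/Spanish scores are in-sample and this should be stated explicitly here.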
We treat performance on these two languages as an approximate upper bound for what the model can achieve under our evaluation protocol.

\subsection{Model}

We evaluate a single off-the-shelf model:

\begin{itemize}
    \item \textbf{Model:} \texttt{joeddav/xlm-roberta-large-xnli}
    \item \textbf{Base architecture:} XLM-R large~\cite{conneau2020xlmr}
    \item \textbf{Fine-tuning:} Trained on XNLI for NLI in 15 languages
\end{itemize}

XLM-R itself is a multilingual masked language model pretrained on over 100 languages with the RoBERTa-style objective.
The \texttt{joeddav/xlm-roberta-large-xnli} checkpoint adds a classification head and fine-tuning on XNLI, making it a widely used backbone for zero-shot cross-lingual NLI and text classification.

This model is a natural choice for our study for two reasons.
First, it is popular in practice; many applied systems rely on it, often under the assumption that it ``supports'' a wide range of languages out of the box.
Second, its fine-tuning data explicitly excludes the Indigenous languages in AmericasNLI, making it a realistic example of a model that is powerful in high-resource contexts but not intentionally designed for the underrepresented languages we care about.

\subsection{Evaluation Protocol}

We adopt a uniform, zero-shot evaluation protocol across all languages and datasets.
There is no fine-tuning or adaptation on AmericasNLI.

\paragraph{Input encoding.}
For each (premise, hypothesis) pair, we use the Hugging Face XLM-R tokenizer to encode the sentence pair with a maximum sequence length of 128 tokens.
We rely on the standard sentence-pair encoding format used in \texttt{transformers}, which joins the premise and hypothesis with the special separator tokens expected by XLM-R.

\paragraph{Model inference.}
We run batched inference using a batch size of 32 and single-GPU acceleration.
For each batch, we obtain the model's logits over the three NLI labels (entailment, neutral, contradiction).
We then take the argmax to obtain the predicted class for each example.

\paragraph{Metrics.}
For each language and dataset, we report two standard classification metrics:

\begin{itemize}
    \item \textbf{Accuracy}: the proportion of test examples for which the predicted label matches the gold label.
    \item \textbf{Macro-F1}: the unweighted average F1 score across the three classes, which helps reduce the impact of any label imbalance.
\end{itemize}

To support RQ1 and RQ2, we also compute aggregate mean accuracy for two groups:

\begin{itemize}
    \item \textbf{High-resource group}: the English and Spanish XNLI test sets.
    \item \textbf{Low-resource group}: the ten AmericasNLI languages.
\end{itemize}

Formally, given a test set $\{(p_i, h_i, y_i)\}_{i=1}^N$ for a particular language,
with gold labels $y_i \in \mathcal{Y}$ and model predictions
$\hat{y}_i = f_\theta(p_i, h_i)$, the accuracy is
\begin{equation}
    \mathrm{Acc}
    = \frac{1}{N} \sum_{i=1}^{N} \mathbf{1}\{\hat{y}_i = y_i\},
\end{equation}
where $\mathbf{1}\{\cdot\}$ is the usual indicator function.

Let $\mathcal{C} = \{\text{entailment}, \text{neutral}, \text{contradiction}\}$ denote
the set of classes.
For each class $c \in \mathcal{C}$, we define precision $P_c$
and recall $R_c$ in the standard way from true positives, false positives,
and false negatives.
The macro-averaged F1 score is then
\begin{equation}
    \mathrm{Macro\mbox{-}F1}
    = \frac{1}{|\mathcal{C}|} \sum_{c \in \mathcal{C}}
      \frac{2 P_c R_c}{P_c + R_c + \varepsilon},
\end{equation}
where $\varepsilon$ is a small constant to avoid division by zero.


\paragraph{Error analysis.}
For RQ3, we focus on Quechua (\texttt{quy}) as a case study.
We run the same inference procedure and then collect a small sample of misclassified examples.
For each, we record the premise, hypothesis, gold label, and predicted label, and we manually inspect them to identify recurring patterns such as:

\begin{itemize}
    \item Confusions between contradiction and neutral.
    \item Systematic over-prediction of entailment.
    \item Apparent difficulties with morphology or idiomatic expressions.
\end{itemize}

\subsection{Implementation Details}

All experiments are implemented in Python using the Hugging Face \texttt{datasets} and \texttt{transformers} libraries.
We load AmericasNLI and XNLI directly from Hugging Face, and we rely on the model hub to fetch the \texttt{joeddav/xlm-roberta-large-xnli} checkpoint.
We fix a random seed for data loading to ensure reproducibility, although the evaluation itself is deterministic given the trained model.

The entire pipeline---from data loading to metric computation---can be run in a single Colab notebook with GPU support.

% =========================================================
% 4. Experiments and Results
% =========================================================

\section{Experiments and Results}
\label{sec:experiments}

In this section, we report our zero-shot evaluation of \textsc{XLM-R}$_\text{large}$ (fine-tuned on XNLI) on the AmericasNLI benchmark and the XNLI test sets for English and Spanish. We focus on three aspects: (i) the overall performance gap between high- and low-resource languages, (ii) variation across Indigenous languages, and (iii) qualitative error patterns that shed light on cultural and morpho-syntactic mismatches.

\subsection{Experimental Setup}

We follow the protocol outlined in Section~\ref{sec:methodology}. For each language in AmericasNLI, we use the official test split and run the pretrained \texttt{joeddav/xlm-roberta-large-xnli} model in a pure zero-shot setting, without any further fine-tuning or adaptation. For XNLI, we evaluate on the English and Spanish test sets to approximate an upper bound for high-resource, in-distribution languages.

Unless otherwise noted, we use a maximum sequence length of 128 subword tokens and a batch size of 32. Longer sequences are truncated from the end. We compute accuracy and macro-averaged F1 (\emph{macro-F1}) over the three NLI labels (entailment, neutral, contradiction). All predictions are obtained with a single forward pass per example.

\subsection{Per-language Performance on AmericasNLI}

Table~\ref{tab:per-language} shows per-language performance on AmericasNLI. Accuracy ranges from roughly 40\% to 47\% across the ten Indigenous languages, with macro-F1 tracking accuracy closely. In other words, even the best-performing languages are just a bit above the random baseline of 33.3\% for a balanced three-way classification task.

\begin{table}[t]
\centering
\caption{Zero-shot NLI performance of \textsc{XLM-R}$_\text{large}$ on AmericasNLI. Accuracy and macro-F1 are computed over three labels: entailment, neutral, and contradiction.}
\label{tab:per-language}
\small
\begin{tabular}{llrcc}
\toprule
Language & Code & \# Test & Acc. & Macro-F1 \\
\midrule
Aymara          & aym & 750 & 0.421 & 0.378 \\
Bribri          & bzd & 750 & 0.443 & 0.419 \\
Asháninka       & cni & 750 & 0.416 & 0.390 \\
Guaraní         & gn  & 750 & 0.460 & 0.428 \\
Wixarika        & hch & 750 & 0.407 & 0.362 \\
Nahuatl         & nah & 738 & 0.463 & 0.446 \\
Otomí           & oto & 748 & 0.434 & 0.407 \\
Quechua         & quy & 750 & 0.417 & 0.388 \\
Shipibo-Konibo  & shp & 750 & 0.467 & 0.456 \\
Rarámuri        & tar & 750 & 0.403 & 0.352 \\
\bottomrule
\end{tabular}
\end{table}

The model does slightly better on Shipibo-Konibo (0.467 accuracy, 0.456 macro-F1) and Nahuatl (0.463 / 0.446), and slightly worse on Rarámuri (0.403 / 0.352) and Wixarika (0.407 / 0.362). Overall, the spread between the best and worst Indigenous languages is modest (about six percentage points of accuracy), and the entire band is far from what would normally be considered ``usable'' performance in high-stakes applications.

Figure~\ref{fig:acc-by-lang} visualizes this distribution. The bar plot makes the story easy to see at a glance: all Indigenous languages cluster just above chance level, with none approaching high-resource performance.

\begin{figure*}[!t]
    \centering
    % Bar plot of per-language zero-shot accuracy on AmericasNLI.
    \includegraphics[width=\textwidth]{Fig/language_performance_figure.pdf}
    \caption{Zero-shot accuracy of \textsc{XLM-R}$_\text{large}$ on each AmericasNLI language. All languages sit in a narrow band around 43\% accuracy, only slightly above the random baseline of 33.3\%.}
    \label{fig:acc-by-lang}
\end{figure*}


\subsection{High- vs.\ Low-resource Comparison}

To understand how large the gap is between high- and low-resource languages for the \emph{same} model and \emph{same} task, we compare AmericasNLI results with XNLI results for English and Spanish (Table~\ref{tab:aggregate}). On XNLI, the model achieves almost perfect accuracy: 0.999 for English and 0.996 for Spanish. Averaged over these two languages, the mean accuracy is approximately 0.998.


For the ten AmericasNLI languages, the mean accuracy is 0.4331. The difference between high- and low-resource groups is therefore about 56.4 percentage points. That is, the \emph{same} widely deployed model behaves almost like a reliable semantic reasoner for English and Spanish, and like a barely-above-chance classifier for Indigenous languages of the Americas.

\begin{table*}[t]
\centering
\small
\begin{tabular}{lcc}
\hline
Group & Languages & Mean Acc. \\
\hline
High-resource (XNLI) & en, es & 0.998 \\
Low-resource (AmericasNLI) & 10 langs & 0.433 \\
\hline
\end{tabular}
\caption{Aggregate zero-shot accuracy of \textsc{XLM-R}$_\text{large}$ on XNLI (English, Spanish) and AmericasNLI (10 Indigenous languages).}
\label{tab:aggregate}
\end{table*}

We can make the high- vs.\ low-resource comparison a bit more explicit.
Let $\mathcal{L}_{\mathrm{high}}$ be the set of high-resource languages
(in our case, English and Spanish from XNLI), and let
$\mathcal{L}_{\mathrm{low}}$ be the set of low-resource languages
(the ten AmericasNLI languages).
For each language $\ell$, let $\mathrm{Acc}(\ell)$ denote the test accuracy
of $f_\theta$ on that language.
We define group-wise mean accuracies as
\begin{equation}
\begin{aligned}
    \overline{\mathrm{Acc}}_{\mathrm{high}}
        &= \frac{1}{|\mathcal{L}_{\mathrm{high}}|}
           \sum_{\ell \in \mathcal{L}_{\mathrm{high}}} \mathrm{Acc}(\ell), \\
    \overline{\mathrm{Acc}}_{\mathrm{low}}
        &= \frac{1}{|\mathcal{L}_{\mathrm{low}}|}
           \sum_{\ell \in \mathcal{L}_{\mathrm{low}}} \mathrm{Acc}(\ell).
\end{aligned}
\end{equation}
The performance gap we highlight can then be written as
\begin{equation}
    \Delta_{\mathrm{Acc}}
    = \overline{\mathrm{Acc}}_{\mathrm{high}}
      - \overline{\mathrm{Acc}}_{\mathrm{low}}.
\end{equation}
In our runs, $\overline{\mathrm{Acc}}_{\mathrm{high}} \approx 0.998$ and
$\overline{\mathrm{Acc}}_{\mathrm{low}} \approx 0.433$, so
$\Delta_{\mathrm{Acc}} \approx 0.564$ (computed from the unrounded group means, which is why it differs slightly from $0.998 - 0.433$).


We also plot this contrast directly in Figure~\ref{fig:agg-bar}. The visual gap between the two bars is a concrete reminder that current multilingual models are far from offering a fair distribution of semantic understanding across languages.

 

\begin{figure}[t]
    \centering
    \includegraphics[width=0.80\linewidth]{Fig/group_mean_accuracy.pdf}
    \caption{Mean zero-shot accuracy of \textsc{XLM-R}$_\text{large}$ for high-resource languages (English (en), Spanish (es)) vs.\ low-resource AmericasNLI languages.}
    \label{fig:agg-bar}
\end{figure}
\subsection{Qualitative Error Analysis}
\label{subsec:qual-analysis}

To understand model failures beyond aggregate metrics, we manually inspect a small sample of misclassified Quechua examples (\texttt{quy}) from the AmericasNLI test set (five cases, sampled from our evaluation outputs). A consistent pattern is over-predicting entailment under high lexical overlap, even when the hypothesis adds unsupported details (gold: neutral) or reverses polarity (gold: contradiction). We also observe confusion between contradiction and neutral when negation or evidential markers shift commitment in ways that are not captured by shallow lexical cues. Overall, these errors suggest the model often relies on surface overlap rather than tracking the underlying entailment relation, consistent with limited task-specific exposure to Indigenous morpho-syntax and discourse markers during fine-tuning.


% =========================================================
% 5. Discussion
% =========================================================

\section{Discussion}
\label{sec:discussion}

We now return to our three research questions and ask what the numbers mean for equity and fairness in multilingual NLP.

\subsection{RQ1: How Well Does the Model Perform on Low-resource Languages?}

On XNLI, \textsc{XLM-R}$_\text{large}$ essentially solves the task: near-perfect accuracy and macro-F1 for English and Spanish. On AmericasNLI, by contrast, the average accuracy is only 43.3\%, with all languages tightly clustered between roughly 40\% and 47\%.

From a purely machine learning perspective, one might say the model is ``above chance'' and move on. From a user-centered perspective, however, these numbers mean that for speakers of Indigenous American languages, a model that appears state-of-the-art in English behaves more like a rough heuristic than a dependable tool.

\subsection{RQ2: Are There Systematic Performance Gaps Across Languages?}

The group-level gap of around 56 percentage points between high- and low-resource languages is hard to ignore. It mirrors, in a concrete NLI setting, what prior work has reported at scale: language technologies work best for languages with large digital footprints and leave many other communities behind.

Within the AmericasNLI group, variation is relatively small. Shipibo-Konibo and Nahuatl sit at the high end, Rarámuri and Wixarika at the low end, but the spread is only about six percentage points. This suggests that the main issue is not fine-grained linguistic typology, but the fact that none of these languages were treated as first-class targets during model development.

\subsection{RQ3: What Do Error Patterns Tell Us About Cultural and Linguistic Mismatch?}

The Quechua examples show that the model struggles precisely where local morpho-syntax and pragmatics carry much of the meaning: negation, evidentiality, and subtle shifts in commitment or stance. It tends to over-predict entailment when there is high lexical overlap, even when the hypothesis is neutral or contradictory, and to over-predict contradiction when it encounters explicit negation without tracking what is actually being negated.

For speakers, this is more than an abstract optimization issue. Misclassifying contradictions as entailments effectively asserts that two statements agree when they in fact conflict. Collapsing neutral into contradiction can create an impression of disagreement where none exists. When such behaviors systematically affect already underserved communities, they become fairness concerns rather than mere model quirks.

\subsection{Implications for Evaluation and Model Selection}

Our results also speak to how we evaluate and select multilingual models. If we only look at averages over high-resource languages, or macro-averages over mostly well-supported languages, severe disparities can remain hidden. A model may be ``state-of-the-art'' on a popular benchmark and still perform at near-chance level for entire language families.

In this sense, AmericasNLI serves as a useful stress test: it exposes the limits of zero-shot transfer for languages that have, until recently, been almost invisible in large-scale NLP evaluation. For practitioners, the message is simple but important: whenever possible, we should inspect how a multilingual model behaves on truly low-resource languages, not just on those that dominate existing leaderboards.


% =========================================================
% 6. Limitations and Future Work
% =========================================================

\section{Limitations and Future Work}
\label{sec:limitations}

This study is intentionally narrow. We focus on a single task (natural language inference), a single benchmark (AmericasNLI), and one widely used multilingual model (\textsc{XLM-R}$_\text{large}$ fine-tuned on XNLI). This makes the setup easy to reproduce and interpret, but also means our conclusions should be read with caution.

First, we do not adapt or fine-tune the model on Indigenous languages. Prior work shows that continued pretraining or translation-based approaches can substantially improve performance on AmericasNLI. Our goal here is different: to quantify out-of-the-box behavior in a realistic zero-shot setting. Future work should compare a range of adaptation strategies through the same fairness lens.

Second, we analyze only a small set of qualitative errors for one language (Quechua). A fuller picture would require systematic human analysis across all ten languages, ideally involving fluent speakers and community members rather than external annotators alone.

Third, our evaluation is purely text-based. Many real-world uses of NLP in Indigenous communities involve speech, code-switching, non-standard orthographies, or multimodal input. The gap we observe on clean written NLI data therefore likely underestimates the disparity these communities face in real-world use.

Finally, we do not attempt to model the social or historical forces that produced the current distribution of resources and benchmarks. Our work sits downstream of those structural inequalities; addressing them will require collaboration well beyond the technical NLP community.

% =========================================================
% 7. Ethical Considerations
% =========================================================
\section{Ethical Considerations}
\label{sec:ethics}

\subsection{Working with Indigenous Languages}

AmericasNLI was created with care, with documented data collection and annotation. Even so, work with Indigenous languages must attend to consent, representation, and benefit sharing. In this study we rely only on the publicly released splits and do not scrape additional data or train new models from scratch.

Our analysis also remains at arm's length from the communities whose languages are represented. We do not claim to speak for them or fully capture the social meaning of the sentences. Any deployment of models for these languages should involve collaboration with community members, local experts, and relevant institutions.

\subsection{Risks of Misuse and Misinterpretation}

A first risk is that poor performance is misread as evidence that Indigenous languages are ``too hard'' or ``not worth supporting.'' The opposite is true: the gap we see reflects design choices and resource allocation in the NLP ecosystem, not properties of the languages themselves.

A second risk is over-confidence in models for high-resource languages. Near-perfect accuracy on XNLI for English and Spanish does not mean these models are unbiased or harmless. They can still encode and amplify stereotypes, treat social groups unfairly, or fail in unexpected ways outside their original training domains.

\subsection{Fairness and Responsibility}

From a fairness perspective, the core message is simple: current multilingual language models offer very different levels of service depending on which language a person speaks. This inequality is the outcome of choices about which languages to include in pretraining, which benchmarks to prioritize, and which adaptation methods to develop.

As researchers and practitioners, we have a responsibility to make these disparities visible and to push for evaluation practices that do not quietly ignore underserved communities. Benchmarks like AmericasNLI are one part of that work; building inclusive datasets, governance frameworks, and community partnerships is the longer-term task.

% =========================================================
% 8. Conclusion
% =========================================================

\section{Conclusion}
\label{sec:conclusion}

We set out to answer a simple question: what happens when a widely used multilingual NLI model, one that performs nearly perfectly on English and Spanish, is asked to handle truly low-resource Indigenous languages?

Using AmericasNLI as our testbed, we found that \textsc{XLM-R}$_\text{large}$ in a zero-shot setting hovers just above chance for all ten Indigenous languages, with an average accuracy of 43.3\%. In sharp contrast, the same model reaches approximately 99.8\% mean accuracy on XNLI for English and Spanish. The resulting gap of about 56 percentage points is a clear illustration of the linguistic digital divide.

Our qualitative analysis of Quechua error cases suggests that the model especially struggles with morpho-syntactic features and pragmatic nuances that were not part of its fine-tuning regime. In such cases, it falls back on shallow lexical cues and produces label assignments that are often at odds with human intuition.

From a technical point of view, these results are unsurprising: it is hard to do well on languages that receive little or no explicit supervision. From a human point of view, however, they are a reminder that talk of ``multilingual'' models can easily obscure who actually benefits from the technology. If we care about NLP equity, we need evaluation setups that foreground underserved languages and make performance gaps impossible to ignore.

We hope that this small, focused study can serve as a compact, reproducible example for students and practitioners who want to engage seriously with fairness in multilingual NLP, even when working under tight time or resource constraints.


% --------------------
% BIBLIOGRAPHY
% --------------------
\bibliographystyle{splncs04}
\bibliography{ref}

\end{document}
