% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%
\begin{document}
%
\title{Reflective Translation: Enhancing Low-Resource Machine Translation through
Self-Reflection}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Lailah Denny\orcidID{0009-0006-0928-956X} \and
Nicholas Cheng \and
Agrim Sharma
\and
Erin Tan
}
%
\authorrunning{Denny et al.}
\titlerunning{Reflective Translation for Low-Resource MT}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Algoverse AI Research}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Low-resource languages such as isiZulu and isiXhosa face persistent challenges in machine translation (MT) due to limited parallel corpora and scarce linguistic resources. Recent work on large language models (LLMs) suggests that self-reflection—the ability of a model to critique and revise its own outputs—can enhance reasoning and factual consistency. Building on this idea, we present a framework for \textit{Reflective Translation}, wherein an LLM internally evaluates and corrects its own translations to improve semantic fidelity by employing multi-round prompting. We apply our method using GPT-3.5 and Claude Haiku 3.5 on English–isiZulu and English–isiXhosa pairs from the OPUS-100 and NTREX-African datasets. To assess translation quality, we compute BLEU and COMET scores. We find that \textit{Reflective Translation} yields consistent improvements in translation quality from the first to the second pass, across both isiZulu (+0.08 BLEU, +0.13 COMET) and isiXhosa (+0.07 BLEU, +0.09 COMET). We further introduce a first-of-its-kind reflection-augmented dataset built from model-generated self-critiques and corrected translations. Overall, this paper demonstrates reflection-based prompting as a promising approach for enhancing data quality and improving MT in under-resourced languages, bridging the gap between LLM reasoning research and practical translation for global linguistic inclusion.
\end{abstract}
%
%
%
\section{Introduction}

Machine Translation (MT) is a fundamental task for global communication, enabling users to exchange information across languages without the need for human intermediaries. The effectiveness of MT systems depends on their linguistic, factual, and logical faithfulness. Recently, large language models (LLMs) have emerged as powerful translation engines, demonstrating strong performance on task-specific benchmarks without additional fine-tuning \cite{brants2007large,moslem-etal-2023-adaptive}. Despite these advances, there remains a substantial gap in LLM performance for low-resource languages \cite{robinson-etal-2023-chatgpt,haddow-etal-2022-survey}.

Low-resource languages suffer from limited high-quality labeled data, which constrains models to learn incomplete distributions that do not capture full linguistic and sociocultural variation \cite{pavamind}. Under these conditions, LLM-based translators are particularly prone to hallucinations, omissions, and culturally biased renderings \cite{wang2020exposure}. To mitigate these challenges, researchers have developed multilingual pretraining strategies such as mBART and mRASP, which extend coverage and improve cross-lingual transfer \cite{pan2021contrastive}. Although these methods improve robustness, they still struggle to ensure semantic faithfulness and contextual grounding in low-resource settings.

An emerging line of research explores \emph{self-reflection}—prompting a model to critique and refine its own outputs—as a pathway to improve reasoning quality and factuality. Iterative prompting frameworks such as Reflexion \cite{shinn2023reflexion} enhance reasoning through verbal feedback loops, while Self-Refine \cite{madaan2023selfrefine} achieves similar gains through multi-round self-revision. The Chain-of-Verification framework \cite{creswell2023chainofverification} further decomposes generation into explicit verification and synthesis stages, providing an interpretable reasoning scaffold. Beyond inference-time prompting, Reflection-Tuning \cite{li2023reflectiontuning} and the recently proposed ReflectionLLMMT model \cite{wang2024reflectionllmmt} demonstrate that reflection signals can be incorporated into model training or translation pipelines to improve consistency and accuracy.

In this work, we investigate whether reflective checking can be leveraged as an explicit reasoning step to improve translation fidelity at inference time, without any model fine-tuning. We conceptualize translation as a form of constrained reasoning, where each target segment must faithfully represent the semantics of the source text. To operationalize this, we design a reflection-guided translation framework in which an LLM first produces an initial translation, then generates a structured self-reflection identifying characteristic failure modes (e.g., mistranslation, omission, distortion) and concise corrective guidance, and finally produces a revised translation conditioned on that reflection. Unlike prior reflection-based approaches that depend on retraining or direct synthetic data generation, our method embeds reflection directly into the prompting process, enabling test-time self-correction.

Empirical evaluation on reasoning-intensive translation benchmarks shows that the introduction of self-reflection significantly improves both the COMET and BLEU scores over standard prompting baselines, without any additional fine-tuning or training. Our results align with previous findings that reflective reasoning improves factual consistency and robustness in generation \cite{shinn2023reflexion,li2023reflectiontuning}, while differing from concurrent approaches such as ReflectionLLMMT~\cite{wang2024reflectionllmmt}, which integrate reflection into supervised translation fine-tuning rather than prompt-level control.


Concretely, our contributions are as follows:
\begin{itemize}
    \item We propose a novel reflection-guided prompting framework for machine translation, where models generate and act upon structured self-assessments to improve translation faithfulness. 
    \item We perform empirical evaluation on two multilingual and low-resource translation datasets (OPUS-100 and NTREX-African) across two different LLMs (GPT-3.5 and Claude Haiku 3.5).
\end{itemize}

\section{Methods}

We propose Reflective Translation, an approach in which a model is guided to self-review its own translations to produce improved outputs based on structured feedback. For each source sentence, GPT-3.5 \cite{openai2023gpt3.5} and Claude Haiku 3.5 \cite{anthropic2024claude35} first generate an initial translation, which is then evaluated against predefined BLEU \cite{papineni2002bleu} and COMET \cite{rei2020comet} thresholds. If the translation does not meet these criteria, the model produces a structured reflection that identifies key errors, suggests concise corrections, and highlights critical phrases or factual details to preserve.

Each reflection consists of three components: identifying key errors in the initial translation, suggesting reusable high-level corrections, and highlighting critical phrases or content that must be preserved. This reflection informs the generation of a second, refined translation, enhancing both semantic fidelity and fluency without requiring external feedback or additional parallel data.

To prevent leakage from the reflection into the second translation, key content words are extracted and masked using the Rapid Automatic Keyword Extraction (RAKE) algorithm, implemented in the NLTK library~\cite{rake}. Key phrases are replaced with the \texttt{<MASK>} token, ensuring that the model relies on comprehension rather than copying. Second-attempt translations are generated based on these masked reflections, guided by structured feedback. BLEU and COMET scores are computed for the improved translations to quantify the impact of reflection-informed guidance, forming the final reflection dataset used in subsequent experiments.

The framework is designed to be generalizable across low-resource languages and compatible with any large language model capable of following structured prompts. It can also incorporate supplementary guidance strategies, such as few-shot examples or threshold-based performance criteria, to further improve reliability and consistency. In this work, we apply the reflective translation method to English–isiZulu and English–isiXhosa corpora.

\begin{figure}[h]
    \includegraphics[width=0.9\linewidth]{figures/figure3.pdf}
    \caption{Overview of the reflective translation framework. The model generates an initial translation, evaluates it using structured reflection, and produces a refined translation guided by key error corrections and masked content words.}
    \label{fig:architecture}
\end{figure}


\subsection{Datasets}

We evaluate our approach using two datasets: \textit{OPUS-100} \cite{tiedemann2012parallel}, which provides broad multilingual coverage across diverse languages and domains, and \textit{NTREX-African} \cite{ntrex2023}, which contains sentence-level data spanning general topics in African languages. Our experiments focus on the low-resource languages isiZulu and isiXhosa, enabling assessment of model performance in challenging translation scenarios. Specifically, we use OPUS-100 for English–isiZulu translation and NTREX-African for English–isiXhosa translations.

OPUS-100 is a large-scale parallel corpus covering 100 language pairs, constructed primarily from web-crawled sources including government documents, subtitles, and domain-diverse online texts. The English–isiZulu subset contains approximately 250,000 parallel sentences. NTREX-African, in contrast, provides professionally curated evaluation sets for several African languages, including isiXhosa, with roughly 40,000 sentence pairs, collected from educational and media sources to ensure linguistic quality and coverage.

IsiZulu and isiXhosa are Bantu languages spoken predominantly in South Africa. IsiZulu is the most widely spoken language in the country, with roughly 12 million native speakers, while isiXhosa has around 8 million native speakers. Despite this, both languages are considered low-resource in NLP due to the limited availability of high-quality parallel corpora and computational resources. This scarcity makes it challenging to train and evaluate machine translation models, highlighting the need for methods like reflective translation that can improve performance without relying on massive datasets.

The two datasets differ in their language coverage and construction: OPUS-100 offers large parallel corpora across 100 language pairs, including English–isiZulu, derived from web-crawled and domain-diverse sources. NTREX-African, in contrast, provides a smaller but professionally curated evaluation set for several African languages, including isiXhosa. Using each dataset for the language it most robustly supports ensures that our experiments rely on the highest-quality and most complete parallel data available for each translation direction.

\subsection{Baseline}

We establish baseline translation performance using GPT-3.5 and Claude Haiku 3.5 without fine-tuning. For each source sentence, the model produces an initial translation, which is then reviewed via reflection-informed prompting. This reflection identifies key errors and provides concise corrective guidance, which the model uses to generate a second, refined translation. In addition to zero-shot generation, we conduct few-shot experiments where exemplar translations are included in the prompt, allowing assessment of the impact of in-context examples on translation quality. This setup enables evaluation of both the contribution of reflection and the benefit of few-shot prompting.


\subsection{Evaluation}

Translation quality is assessed using two complementary metrics: BLEU and COMET. BLEU quantifies n-gram overlap between generated and reference translations, providing insight into surface-level lexical accuracy while penalizing overly short outputs. Formally, BLEU is computed as:

\[
    \text{BLEU} = BP \cdot \exp\left(\sum_{n=1}^{N} w_n \log p_n\right)
\]

where $p_n$ is the n-gram precision, $w_n$ are the weights (usually uniform), $N$ is the maximum n-gram order, and $BP$ is the brevity penalty:

\[
BP = 
\begin{cases}
    1 & \text{if } c > r \\
    e^{1-r/c} & \text{if } c \le r
\end{cases}
\]

with $c$ as the candidate translation length and $r$ as the reference length.

COMET is a neural-based metric defined as:

\[
\text{COMET}(x, y) = f_\theta(x, y)
\]

where $x$ is the generated translation, $y$ is the reference translation, and $f_\theta$ is a pretrained multilingual embedding-based model that predicts a human-aligned quality score.

Both first- and second-attempt translations are scored, and averaging across examples provides a robust estimate of baseline performance and the gains attributable to reflection. This dual-metric evaluation captures both literal accuracy and semantic preservation, which is critical for low-resource language settings.

OPUS-100 and NTREX-African differ in their language coverage and construction: OPUS-100 offers large parallel corpora across 100 language pairs, including English–isiZulu, derived from web-crawled and domain-diverse sources, whereas NTREX-African provides smaller, professionally curated evaluation sets for several African languages, including isiXhosa. Using each dataset for the language it most robustly supports ensures that our experiments rely on the highest-quality and most complete parallel data available for each translation direction.

\section{Results}
Across translation tasks, error thresholds, and prompting strategies, we find that self-reflection consistently improves translation quality. Tables \ref{tab:xh_threshold_combined_small} and \ref{tab:zu_threshold_combined_small} report the BLEU and COMET scores for isiXhosa and isiZulu translations, respectively. Results are reported across five different score thresholds---that is, the minimum acceptable score for first-try translations without executing our self-reflection pipeline. We observe consistent increases in second-try translations after applying Reflective Translation to incorrectly translated samples. Overall, the results show that iterative self-refinement offers a simple but effective way to boost translation quality without requiring 
additional training data or parameter updates.

Across all settings, reflective translation improves second-pass outputs by an average of +0.07 BLEU and +0.18 COMET, with the larger gains appearing under stricter confidence thresholds. These consistent improvements demonstrate that self-reflection reliably enhances translation quality. The plateau in second-pass BLEU at higher thresholds reflects how strict confidence requirements limit the number of sentences eligible for reflection: because refinement is concentrated on a smaller set of translations, the metric stabilizes as fewer sentences undergo refinement.

\subsection{Ablations}
\paragraph{Error Threshold.} To understand how confidence filtering interacts with reflection, we perform a threshold ablation over a range of cutoff values.

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\linewidth]{figures/ablation_haiku (1).png}
    \caption{Threshold Ablation using Haiku 3.5. Higher thresholds produce fewer translations but lead to larger BLEU and COMET improvements, highlighting the effectiveness of self-reflection under high-confidence predictions.}
    \label{fig:haiku_ablation}
\end{figure}



Lower thresholds increase coverage, but yield smaller quality gains, indicating that reflection has limited impact when the initial translation is highly uncertain.

Taken together, these results show that self-reflection remains helpful across all thresholds, with its strongest benefits appearing under stricter filtering where refinement is most impactful.

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.9\linewidth]{figures/threshold_ablation (1).jpg}
    \caption{Threshold Ablation using ChatGPT 3.5. Lower thresholds increase coverage but produce smaller gains, highlighting the trade-off between output volume and quality.}
    \label{fig:chatgpt_ablation}
\end{figure}

\paragraph{Prompting Strategies.}
In addition to the zero-shot strategy employed in our baseline results, we evaluate two additional prompting strategies to isolate the influence of reasoning structure, in-context signals, and self-correction mechanisms on translation quality. We employ Few-Shot \cite{brown2020languagemodelsfewshotlearners} and Chain-of-Thought (CoT) \cite{wei2023chainofthoughtpromptingelicitsreasoning} prompting. These prompting strategies are incorporated into our reflective-translation pipeline. All prompts are provided in the Appendix.

Chain-of-thought (CoT) prompting encourages the model to articulate intermediate reasoning before producing the final translation. This setup allows us to test whether making the model ``think out loud'' helps it navigate structural ambiguity, complex sentence constructions, or subtle semantic nuances. The few-shot prompt presents a small number of input–output examples. These demonstrations give concrete guidance on target style and sentence structure, helping the model align its translations through pattern matching and in-context learning.

\begin{table}[h!]
\centering
\caption{Average scores for prompting techniques using Haiku 3.5. 
BLEU$_1$ and COMET$_1$ correspond to the first attempt, 
while BLEU$_2$ and COMET$_2$ correspond to the second attempt.}

\footnotesize
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{1.2} 
\begin{tabular}{|c|c|c|c|c|}
\hline
\multicolumn{5}{|c|}{\textbf{English-isiXhosa}} \\
\hline
Prompt & BLEU$_1$ & COMET$_1$ & BLEU$_2$ & COMET$_2$ \\
\hline
baseline & 0.08414 & 0.57696 & 0.16297 & 0.65792 \\
chain-of-thought & 0.09109 & 0.56773 & 0.11680 & 0.64955 \\
few-shot & 0.08936 & 0.57109 & 0.16970 & 0.69555 \\
\hline
\multicolumn{5}{|c|}{\textbf{English-isiZulu}} \\
\hline
Prompt & BLEU$_1$ & COMET$_1$ & BLEU$_2$ & COMET$_2$ \\
\hline
baseline & 0.16068 & 0.65606 & 0.24389 & 0.78464 \\
chain-of-thought & 0.15195 & 0.64749 & 0.23953 & 0.75492 \\
few-shot & 0.06751 & 0.57770 & 0.28726 & 0.79589 \\
\hline
\end{tabular}
\label{tab:avg_scores_haiku_small}
\end{table}


\vspace{-2mm}

\begin{table}[h!]
\centering
\caption{Average scores for prompting techniques using ChatGPT 3.5. 
BLEU$_1$ and COMET$_1$ correspond to the first attempt, 
while BLEU$_2$ and COMET$_2$ correspond to the second attempt.}

\footnotesize
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{1.2} 
\begin{tabular}{|c|c|c|c|c|}
\hline
\multicolumn{5}{|c|}{\textbf{English-isiXhosa}} \\
\hline
Prompt & BLEU$_1$ & COMET$_1$ & BLEU$_2$ & COMET$_2$ \\
\hline
baseline & 0.13351 & 0.63365 & 0.12326 & 0.66553 \\
chain-of-thought & 0.12307 & 0.62451 & 0.10580 & 0.68168 \\
few-shot & 0.11132 & 0.60456 & 0.11571 & 0.66560 \\
\hline
\multicolumn{5}{|c|}{\textbf{English-isiZulu}} \\
\hline
Prompt & BLEU$_1$ & COMET$_1$ & BLEU$_2$ & COMET$_2$ \\
\hline
baseline & 0.16111 & 0.67172 & 0.20321 & 0.76672 \\
chain-of-thought & 0.17722 & 0.66225 & 0.15124 & 0.74844 \\
few-shot & 0.17217 & 0.62550 & 0.11886 & 0.71479 \\
\hline
\end{tabular}
\label{tab:avg_scores_gpt35_small}
\end{table}

\section{Discussion}

Our study demonstrates a significant boost in translation quality for low-resource, morphologically rich languages via structured self-reflection. Across both \textit{isiZulu} and \textit{isiXhosa}, second-pass translations incorporating masked self-critiques consistently outperformed initial outputs, confirming that explicit reflection helps the model identify and correct systematic errors such as agreement mismatches and named-entity distortions. Among prompting strategies, the few-shot reflection setup achieved the most balanced and stable improvements across BLEU and COMET metrics, highlighting the value of in-context exemplars for guiding reflective behavior. Interestingly, self-reflection in the zero-shot setting still surpassed few-shot prompting \textit{without} reflection, suggesting that structured critique provides a stronger inductive bias for faithfulness than mere exposure to exemplars. Additional experiments demonstrated a predictable scaling of performance with stricter adequacy and COMET thresholds, supporting the hypothesis that reflection can serve as a controllable lever for enhancing semantic precision.

\subsection{Comparison to Baselines}

To further validate our approach, we compared Reflective Translation (RT) against two relevant baselines: Self-Refine (SR) \cite{madaan2023selfrefine} and GEPA \cite{agrawal2025gepa}. Self-Refine iteratively refines translations through model self-critique, which aligns closely with the premise of Reflective Translation, while GEPA serves as an additional reference. The results, shown in Table~\ref{tab:baseline_comparison_results_style}, demonstrate that RT consistently outperforms both SR and GEPA across BLEU and COMET metrics, with more robust and consistent translation quality across languages. These findings indicate that reflective prompting improves structural consistency and syntactic fidelity in low-resource settings beyond what iterative self-refinement alone can achieve.

\begin{table}[h!]
\centering
\caption{Comparison of Reflective Translation (RT) with Self-Refine (SR) and GEPA baselines.}
\footnotesize
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{1.2} 
\begin{tabular}{|c|c|c|c|}
\hline
\multicolumn{4}{|c|}
{\textbf{BLEU Comparison}} \\
\hline
Method & BLEU(SR) & BLEU(GEPA) & BLEU(RT) \\
\hline
Scores & 0.00726 & 0.01651 & 0.13543 \\
\hline
\multicolumn{4}{|c|}{\textbf{COMET Comparison}} \\
\hline
Method & COMET(SR) & COMET(GEPA) & COMET(RT) \\
\hline
Scores & 0.33204 & 0.40096 & 0.54861 \\
\hline
\end{tabular}
\label{tab:baseline_comparison_results_style}
\end{table}

Overall, this work presents a lightweight, model-agnostic framework for improving translation in low-resource settings through reflection-guided prompting. By embedding structured self-critique into the inference process rather than relying on fine-tuning or additional data collection, this approach offers a practical and reproducible method for enhancing translation fidelity while minimizing annotation costs. Beyond performance gains, our framework also functions as a form of model-based data augmentation—producing interpretable (source, draft, critique, revision) tuples that can support future supervised training and analysis of reflective behavior.

\subsection{Limitations and Future Work}

While our findings highlight the potential for self-reflection to improve translation quality, there are several limitations in our preliminary work. The scope of our experiments is restricted to isiZulu and isiXhosa, which are both members of the Nguni language group, and thus share similarities in grammar structure and phonetics; our results therefore may not show the ability of Reflective Translation to generalize to other low-resource or morphologically complex languages, and extended experiments are required on a variety of low-resource languages from independent origins. Additionally, only two large language models (GPT-3.5 and Claude Haiku 3.5) were evaluated; further studies should include additional model architectures to assess generalizability.

Moreover, while BLEU and COMET scores effectively capture surface and semantic similarity, they may overlook sociocultural nuances, small grammatical distinctions, and contextual fidelity, particularly in underrepresented languages. Developing richer evaluation protocols that incorporate human judgment could provide deeper insights into translation quality. Future work may also explore broader typological coverage, more adaptable architectures, and expanded reflective prompting strategies to enhance scalability, fairness, and representation in low-resource machine translation.

\bibliographystyle{splncs04}
\bibliography{mybibliography}

\appendix
\onecolumn
\section*{Reference Examples and Prompt Templates for Low-Resource Languages}

\subsection*{Baseline Translation Prompts}

\begin{figure}[h]
    \centering
    \fbox{%
        \parbox{1\linewidth}{%
            \textbf{Baseline First-Try Prompt:}\\
            \texttt{%
Source (\{lang\_name\}): \{source\_text\}\\
You are a professional translator. Translate the given text accurately into English. Preserve the original meaning, tone, and nuance.\\
Output format (exact):\\
Translation:\\
<START\_TRANSLATION>\\
<your English translation here>\\
<END\_TRANSLATION>\\
Do NOT include any explanations.
            }
        }
    }
    \caption{Baseline translation prompt for first attempt.}
\end{figure}

\begin{figure}[h]
    \centering
    \fbox{%
        \parbox{1\linewidth}{%
            \textbf{Baseline Second-Try Prompt (Reflection-Based):}\\
            \texttt{%
Source (\{lang\_name\}): \{source\_text\}\\
You are a professional translator. Based on the following review and reflection, provide an improved translation.\\
Reflection: \{reflection\}\\
Output format (exact):\\
Translation:\\
<START\_TRANSLATION>\\
<your improved English translation here>\\
<END\_TRANSLATION>\\
Do NOT include explanations.
            }
        }
    }
    \caption{Baseline second attempt with reviewer reflection.}
\end{figure}

\subsection*{Few-Shot Translation Prompts}

\begin{figure}[h]
    \centering
    \fbox{%
        \parbox{1\linewidth}{%
            \textbf{Few-Shot First-Try Prompt:}\\
            \texttt{%
Source (\{lang\_name\}): \{source\_text\}\\
You are a professional translator. Translate the following text into English accurately.\\
Here are examples for guidance: \\
Source (isiZulu): Ngiyabonga kakhulu.\\
Translation: Thank you very much.\\
Source (isiZulu): Unjani namhlanje?\\
Translation: How are you today?\\
Output format:\\
Translation:\\
<START\_TRANSLATION>\\
<your English translation here>\\
<END\_TRANSLATION>
            }
        }
    }
    \caption{Few-shot prompt with guiding examples.}
\end{figure}

\begin{figure}[h]
    \centering
    \fbox{%
        \parbox{1\linewidth}{%
            \textbf{Few-Shot Second-Try Prompt (Reflection-Based):}\\
            \texttt{%
Source (\{lang\_name\}): \{source\_text\}\\
Improve the English translation of the following text using the review and reflection: \{reflection\}\\
Here are examples for guidance: \\
Reflection: Incorrect verb nuance fixed.\\
Improved Translation: I would like to ask a question.\\
Output format:\\
Translation:\\
<START\_TRANSLATION>\\
<your improved English translation here>\\
<END\_TRANSLATION>
            }
        }
    }
    \caption{Few-shot second attempt with reflection and examples.}
\end{figure}

\subsection*{Brief Reasoning (Chain-of-Thought) Prompts}

\begin{figure}[h]
    \centering
    \fbox{%
        \parbox{1\linewidth}{%
            \textbf{Brief Reasoning First-Try Prompt:}\\
            \texttt{%
Translate the following \{lang\_name\} text into English.\\
Before giving the final answer, perform brief internal reasoning. Do NOT reveal your reasoning.\\
Source (\{lang\_name\}): \{source\_text\}\\
Output format:\\
Translation:\\
<START\_TRANSLATION>\\
<your English translation here>\\
<END\_TRANSLATION>
            }
        }
    }
    \caption{First attempt with brief internal reasoning.}
\end{figure}

\begin{figure}[h]
    \centering
    \fbox{%
        \parbox{1\linewidth}{%
            \textbf{Brief Reasoning Second-Try Prompt (Reflection-Based):}\\
            \texttt{%
Improve the English translation of the following \{lang\_name\} text.\\
Use the review and reflection to fix errors. Perform brief internal reasoning but do NOT reveal it.\\
Source (\{lang\_name\}): \{source\_text\}\\
Review and Reflection: \{reflection\}\\
Output format:\\
Translation:\\
<START\_TRANSLATION>\\
<your improved English translation here>\\
<END\_TRANSLATION>
            }
        }
    }
    \caption{Second attempt with brief reasoning and reflection.}
\end{figure}

\onecolumn
\section*{Tables for BLEU and COMET Scores}
\begin{table}[h!]
\centering
\caption{isiXhosa (xh) BLEU and COMET Threshold Ablation (ChatGPT 3.5 / Haiku 3.5)}
\scriptsize
\begin{tabular}{|c|cc|cc|}
\hline
& \multicolumn{2}{c|}{\textbf{BLEU}} & \multicolumn{2}{c|}{\textbf{COMET}} \\
\cline{2-5}
Threshold & First-Try & \textbf{\textit{RT}} & First-Try & \textbf{\textit{Reflective Translation}} \\
\hline
0.40 & 0.050 / 0.062 & \textbf{0.201} / 0.135 & 0.361 / 0.364 & \textbf{0.619} / 0.599 \\
0.50 & 0.056 / 0.075 & \textbf{0.201} / 0.135 & 0.438 / 0.428 & \textbf{0.657} / 0.630 \\
0.60 & 0.064 / 0.084 & \textbf{0.201} / 0.135 & 0.491 / 0.490 & \textbf{0.694} / 0.660 \\
0.70 & 0.070 / 0.091 & \textbf{0.201} / 0.135 & 0.531 / 0.549 & \textbf{0.720} / 0.704 \\
0.80 & 0.070 / 0.096 & \textbf{0.201} / 0.135 & 0.556 / 0.585 & \textbf{0.720} / 0.704 \\
\hline
\end{tabular}
\label{tab:xh_threshold_combined_small}
\end{table}

\begin{table}[h!]
\centering
\caption{isiZulu (zu) BLEU and COMET Threshold Ablation (ChatGPT 3.5 / Haiku 3.5)}
\scriptsize
\begin{tabular}{|c|cc|cc|}
\hline
& \multicolumn{2}{c|}{\textbf{BLEU}} & \multicolumn{2}{c|}{\textbf{COMET}} \\
\cline{2-5}
Threshold & First-Try & \textbf{\textit{RT}} & First-Try & \textbf{\textit{Reflective Translation}} \\
\hline
0.40 & 0.050 / 0.364 & 0.201 / \textbf{0.599} & 0.361 / 0.364 & \textbf{0.619} / 0.599 \\
0.50 & 0.056 / 0.438 & 0.201 / \textbf{0.657} & 0.438 / 0.428 & \textbf{0.657} / 0.630 \\
0.60 & 0.064 / 0.491 & 0.201 / \textbf{0.694} & 0.491 / 0.490 & \textbf{0.694} / 0.660 \\
0.70 & 0.070 / 0.531 & 0.201 / \textbf{0.720} & 0.531 / 0.549 & \textbf{0.720} / 0.704 \\
0.80 & 0.070 / 0.556 & 0.201 / \textbf{0.720} & 0.556 / 0.585 & \textbf{0.720} / 0.704 \\
\hline
\end{tabular}
\label{tab:zu_threshold_combined_small}
\end{table}

\end{document}