
% Results table (full-width float): accuracy on six MMLU medical subjects for
% Llama-3.2-1B-Instruct and Llama-3.2-3B-Instruct, pairing each prompting
% baseline with its scaled variant on the row directly below it.
% Requires booktabs (\toprule, \midrule, \bottomrule), makecell (\makecell)
% and graphicx (\resizebox).
% \scaling, \gain and \loss are macros defined outside this block.
% NOTE(review): \gain/\loss presumably typeset the signed change (in percentage
% points) relative to the unscaled row above -- confirm against their definitions.
\begin{table*}[t!]
\centering
% The trailing % signs below stop line ends from turning into stray spaces
% inside the box that \resizebox measures and scales.
\resizebox{\textwidth}{!}{%
{\fontsize{12pt}{15pt}\selectfont%
\begin{tabular}{l|llllll}
\toprule
% Header: one left-aligned, possibly two-line cell per column. The leading
% "~ \\" in the first cell pushes "Method" down onto the second header line.
\makecell[l]{~ \\ \textbf{Method}}
  & \makecell[l]{\textbf{Clinical} \\ \textbf{Knowledge}}
  & \makecell[l]{\textbf{Medical} \\ \textbf{Genetics}}
  & \makecell[l]{\textbf{Anatomy}}
  & \makecell[l]{\textbf{Professional} \\ \textbf{Medicine}}
  & \makecell[l]{\textbf{College} \\ \textbf{Biology}}
  & \makecell[l]{\textbf{College} \\ \textbf{Medicine}} \\
\midrule
\midrule
% ---- Model group 1 ----
\multicolumn{7}{l}{\em Llama-3.2-1B-Instruct} \\
\midrule
Direct Answering & 0.35 & 0.34 & 0.41 & 0.31 & 0.35 & 0.33 \\
Direct Answering (\scaling) & \textbf{0.41\gain{6.2pp}} & \textbf{0.39\gain{4.7pp}} & \textbf{0.47\gain{6.5pp}} & \textbf{0.38\gain{6.1pp}} & \textbf{0.40\gain{4.4pp}} & \textbf{0.39\gain{5.8pp}} \\
\midrule
One-stage CoT & 0.16 & 0.16 & 0.21 & 0.15 & 0.18 & 0.17 \\
One-stage CoT (\scaling) & 0.04\loss{12.1pp} & 0.02\loss{14.1pp} & 0.08\loss{12.8pp} & 0.01\loss{13.9pp} & 0.05\loss{12.5pp} & 0.07\loss{9.6pp} \\
\midrule
\midrule
% ---- Model group 2 ----
\multicolumn{7}{l}{\em Llama-3.2-3B-Instruct} \\
\midrule
Direct Answering & 0.60 & 0.65 & 0.57 & 0.68 & 0.65 & 0.54 \\
Direct Answering (\scaling) & \textbf{0.64\gain{3.7pp}} & \textbf{0.70\gain{4.4pp}} & \textbf{0.64\gain{6.6pp}} & \textbf{0.77\gain{8.7pp}} & \textbf{0.70\gain{5.1pp}} & \textbf{0.55\gain{0.8pp}} \\
\midrule
One-stage CoT & 0.45 & 0.49 & 0.48 & 0.39 & 0.38 & 0.38 \\
One-stage CoT (\scaling) & 0.57\gain{12.4pp} & 0.66\gain{17.1pp} & 0.62\gain{14.5pp} & 0.50\gain{11.2pp} & 0.45\gain{7.8pp} & 0.45\gain{7.2pp} \\
% No blank line may separate the last row from the closing rule: a blank line
% here would start a new (empty) alignment row.
\bottomrule
\end{tabular}%
}%
}
\vspace{1em}
\caption{Accuracy on six medical domains of MMLU using different prompting strategies.
We compare \textbf{baselines}, namely direct answering and one-stage chain-of-thought (CoT) prompting, with their \textbf{TTS} variants using $N=64$.
For \textsc{Llama-3.2-1B-Instruct}, CoT prompting substantially degrades performance, and applying TTS further amplifies this degradation.
For \textsc{Llama-3.2-3B-Instruct}, CoT lowers baseline accuracy, but TTS recovers performance, yielding consistent improvements.
These results suggest TTS is most effective when models achieve non-trivial accuracy (above random guessing, i.e., $\sim$25\% for four-choice questions); otherwise, scaling may reinforce biased or uninformative reasoning.}
\label{tab:comparison_mmlu_small}
\end{table*}