
% Table: accuracy on six MMLU medical subsets for two 8B models, comparing
% direct answering and one-stage CoT with and without test-time scaling.
% NOTE(review): \scaling and \gain are not defined in this float; they are
% presumably declared in the preamble -- confirm before moving this table.
\begin{table*}[t!]
\centering
% Every brace-terminated line inside the \resizebox argument ends with `%':
% the argument is typeset in horizontal mode, so an unprotected newline would
% become an inter-word space that is measured (and scaled) with the table.
\resizebox{\textwidth}{!}{%
  {\fontsize{12pt}{15pt}\selectfont
  % \setlength{\tabcolsep}{5pt}
  \begin{tabular}{l|llllll}
    \toprule
    % Header row: \makecell stacks two-line column titles; the leading `~' line
    % in the first cell pushes "Method" down to the second header line.
    \makecell[l]{~ \\ \textbf{Method}}
      & \makecell[l]{\textbf{Clinical} \\ \textbf{Knowledge}}
      & \makecell[l]{\textbf{Medical} \\ \textbf{Genetics}}
      & \makecell[l]{\textbf{Anatomy}}
      & \makecell[l]{\textbf{Professional} \\ \textbf{Medicine}}
      & \makecell[l]{\textbf{College} \\ \textbf{Biology}}
      & \makecell[l]{\textbf{College} \\ \textbf{Medicine}} \\
    \midrule
    \midrule
    % --- Model 1 ---------------------------------------------------------
    \multicolumn{7}{l}{\em Llama-3.1-8B-Instruct} \\
    \midrule
    Direct Answering & 0.71 & 0.75 & 0.62 & 0.73 & 0.75 & 0.64 \\
    Direct Answering (\scaling) & 0.72\gain{1.3pp} & 0.78\gain{3.5pp} & 0.65\gain{2.8pp} & 0.77\gain{3.6pp} & 0.77\gain{2.3pp} & 0.67\gain{3.1pp} \\
    \midrule
    One-stage CoT & 0.71 & 0.77 & 0.66 & 0.71 & 0.73 & 0.65 \\
    % Bold row: highest accuracy in every column for this model.
    One-stage CoT (\scaling) & \textbf{0.80\gain{9.2pp}} & \textbf{0.84\gain{7.6pp}} & \textbf{0.72\gain{5.7pp}} & \textbf{0.85\gain{14.3pp}} & \textbf{0.84\gain{11.2pp}} & \textbf{0.77\gain{11.8pp}} \\
    \midrule
    \midrule
    % --- Model 2 ---------------------------------------------------------
    \multicolumn{7}{l}{\em DeepSeek-R1-Distill-Llama-8B} \\
    \midrule
    Direct Answering & 0.52 & 0.55 & 0.48 & 0.46 & 0.54 & 0.47 \\
    Direct Answering (\scaling) & 0.56\gain{3.8pp} & 0.62\gain{6.8pp} & 0.51\gain{3.4pp} & 0.57\gain{10.6pp} & 0.62\gain{8.1pp} & 0.52\gain{5.0pp} \\
    \midrule
    One-stage CoT & 0.61 & 0.63 & 0.53 & 0.58 & 0.65 & 0.58 \\
    % Bold row: highest accuracy in every column for this model.
    One-stage CoT (\scaling) & \textbf{0.73\gain{12.1pp}} & \textbf{0.80\gain{17.5pp}} & \textbf{0.64\gain{11.3pp}} & \textbf{0.72\gain{14.4pp}} & \textbf{0.80\gain{15.2pp}} & \textbf{0.74\gain{15.5pp}} \\
    \bottomrule
  \end{tabular}%
  }%
}%
\vspace{1em}
\caption{Accuracy on six medical domains of MMLU using different prompting strategies.
We compare \textbf{baselines} (direct answering and one-stage chain-of-thought (CoT)) with our \textbf{test-time scaling (TTS)} variants for $N=64$.
Across both \textsc{Llama-3.1-8B-Instruct} and \textsc{DeepSeek-R1-Distill-Llama-8B}, applying TTS consistently improves performance over the respective baselines, with the largest gains observed for one-stage CoT, up to 17.5 percentage points (pp).}
\label{tab:comparison_mmlu}
\end{table*}