

% Metric sub-header cells for the results table.  Each expands to a one-column
% \multicolumn so the header cell carries its own column spec:
%   \rocauc -- left-aligned "AUC", no trailing vertical rule;
%   \ap     -- left-aligned "AP", followed by a vertical rule (l|).
\newcommand*{\rocauc}{\multicolumn{1}{l}{AUC}}
\newcommand*{\ap}{\multicolumn{1}{l|}{AP}}

\begin{table*}[t!]
\centering
% Scale the results tabular to \textwidth.  Line ends that directly follow a
% brace inside the \resizebox argument are closed with `%': left unprotected
% they become interword glue that is boxed and scaled along with the tabular,
% so the table itself would come out slightly narrower than \textwidth.
\resizebox{\textwidth}{!}{%
{\fontsize{12pt}{15pt}\selectfont
\setlength{\tabcolsep}{3.5pt}%
\begin{tabular}{l|ll|ll|ll}
  \toprule
  % Header row 1: one two-column group per dataset.
  & \multicolumn{2}{l|}{\textbf{Pneumonia}} & \multicolumn{2}{l|}{\textbf{Colorectal Cancer}} & \multicolumn{2}{l}{\textbf{Diabetic Retinopathy}} \\
  % Header row 2: AUC/AP per dataset.  The last AP cell is spelled out with a
  % plain `l' spec because \ap would add a vertical rule at the table edge.
  \textbf{Method} & \rocauc & \ap & \rocauc & \ap & \rocauc & \multicolumn{1}{l}{AP} \\
  \midrule
  \midrule
  \multicolumn{7}{l}{\emph{Llama-3.2-11B-Vision-Instruct}} \\
  \midrule
  % Each baseline row is followed by its (\scaling) variant; \gain{...}
  % annotates that variant's change over the baseline row directly above.
  Direct Answering & 0.50 & 0.62 & 0.56 & 0.65 & 0.61 & 0.63 \\
  % NOTE(review): the last-column AP below prints as 0.74, the same value that
  % is bolded in the two other (\scaling) rows -- confirm against unrounded
  % scores whether this cell should be bold as well.
  Direct Answering (\scaling) & 0.74\gain{+24.2pp} & 0.79\gain{+16.9pp} & 0.56\gain{+0.5pp} & 0.66\gain{+0.3pp} & \textbf{0.71\gain{+10.0pp}} & 0.74\gain{+11.1pp} \\
  \midrule
  One-stage CoT & 0.53 & 0.64 & 0.48 & 0.62 & 0.58 & 0.61 \\
  One-stage CoT (\scaling) & 0.78\gain{+24.9pp} & 0.83\gain{+18.8pp} & 0.53\gain{+5.4pp} & 0.64\gain{+2.4pp} & 0.67\gain{+9.2pp} & \textbf{0.74\gain{+13.4pp}} \\
  \midrule
  Two-stage Reasoning & 0.52 & 0.63 & 0.54 & 0.65 & 0.57 & 0.61 \\
  Two-stage Reasoning (\scaling) & \textbf{0.82\gain{+30.4pp}} & \textbf{0.86\gain{+22.8pp}} & \textbf{0.65\gain{+10.9pp}} & \textbf{0.75\gain{+10.6pp}} & \textbf{0.71\gain{+13.5pp}} & \textbf{0.74\gain{+13.0pp}} \\
  \bottomrule
\end{tabular}%
}%
}
\vspace{1em}
\caption{Results on MedMNIST datasets: PneumoniaMNIST (pneumonia detection), PathMNIST (colorectal cancer classification), and RetinaMNIST (diabetic retinopathy detection). 
We compare \textbf{baselines} (direct answering, one-stage CoT, and two-stage reasoning) with their \textbf{TTS} variants for $N=16$. 
Across all datasets, applying TTS improves every baseline, often substantially, with two-stage reasoning + TTS achieving the best overall performance. 
The largest gains are observed for two-stage reasoning, up to 30.4 percentage points (pp).
Metrics reported are AUC (area under the ROC curve) and AP (average precision, i.e., the area under the precision--recall curve).}
\label{tab:comparison}
\end{table*}
