% Figure: effect of the sample size N in the TTS setting.
% A figure* float holding three panels side by side, each 0.3\textwidth wide:
%   (a) fig:scaling_a -- one-stage CoT, text
%   (b) fig:scaling_b -- one-stage CoT, vision
%   (c) fig:scaling_c -- two-stage, vision
% The trailing `%' inside each \subfigure body suppresses the end-of-line
% spaces that would otherwise be typeset inside the panel box and shift the
% image off-centre relative to its sub-caption. The line ends *between*
% panels are intentionally left unprotected so that the panels stay separated
% by a normal interword space.
\begin{figure*}[t!]
    \centering
    \subfigure[One-stage CoT (Text)]{%
        \includegraphics[width=0.3\textwidth]{figure/ablation_cot_meta-llama_Llama-3.1-8B-Instruct.pdf}%
        \label{fig:scaling_a}%
    }
    \subfigure[One-stage CoT (Vision)]{%
        \includegraphics[width=0.3\textwidth]{figure/ablation_zero_cot.pdf}%
        \label{fig:scaling_b}%
    }
    \subfigure[Two-stage (Vision)]{%
        \includegraphics[width=0.3\textwidth]{figure/ablation_two.pdf}%
        \label{fig:scaling_c}%
    }
    \caption{The effect of sample size ($N$) in the TTS setting. Increasing the sample size boosts performance across different datasets and inference methods. \textsc{Llama-3.1-8B-Instruct} and \textsc{Llama-3.2-11B-Vision-Instruct} are used for text and vision tasks, respectively.}
    \label{fig:scaling}
\end{figure*}