\begin{table}[h]
\footnotesize
  \caption{Self-improvement over four iterations of the STaPLe algorithm, compared against the STaR baseline (SFT without the principle operating as a latent CoT between the initial and refined attempts). Note that the SFT sample counts for iterations 2--4 differ as the principles are discovered by different models---the STaR Iter 1 and STaPLe Iter 1 models, respectively.}
  \label{table:multiple-iters}
  \centering
  \begin{tabular}{cccccc}
    \toprule
    Model    & MT-Bench (avg)  & MT-Bench (T1) & MT-Bench (T2) & AlpacaEval & IFEval WR \\
    \midrule
    \textbf{Llama-3.1-8B-Instruct} & & & & & \\
    \midrule
    Initial Policy & 7.46 & 8.09 & 6.83 & 26.9 & -- \\
    \midrule
    STaR Iter 1 (28.2k)  & 7.43  & 8.04  & 6.81 & 29.1 & 55.5\%   \\
    STaPLe Iter 1 (28.2k) & 7.66 & 8.15  & 7.16 & 32.2 &  65.6\%   \\
    \midrule
    STaR Iter 2 (6.0k)  & 7.47 & 8.08 &  6.86 & 30.6 & 57.7\% \\
    STaPLe Iter 2 (6.1k) & \textbf{7.74} & \textbf{8.19} & 7.29 & 34.4 & 66.2\% \\
    \midrule
    STaR Iter 3 (6.1k)  & 7.51 & 8.10 & 6.91 & 31.5 & 61.0\% \\
    STaPLe Iter 3 (6.3k) & \textbf{7.74} & 8.16 & \textbf{7.31} & \textbf{35.6} & 68.8\%  \\
    \midrule
    STaR Iter 4 (6.3k)  & 7.56 & 8.11 & 7.00 & 31.8 & 62.3\% \\
    STaPLe Iter 4 (6.6k) & 7.71 & 8.13 & 7.30 & 33.4 &  \textbf{68.9\%} \\
    \midrule
    \midrule
    \textbf{Granite-3.1-8B-Instruct} & & & & & \\
    \midrule
    Initial Policy & 7.83 & 8.59 & 7.08 & 30.2 & -- \\
    \midrule
    STaR Iter 1 (24.1k) & 7.83  & 8.61  & 7.05 & 33.0 & 57.3\%  \\
    STaPLe Iter 1 (24.1k) & 7.99 & 8.69  & 7.29 & 36.7 &   65.1\% \\
    \midrule
    STaR Iter 2 (5.4k)  & 7.86 & 8.63 & 7.10 & 34.7 & 59.5\% \\
    STaPLe Iter 2 (5.2k) & 8.04 & 8.74 & 7.34 & 38.9 & 65.2\% \\
    \midrule
    STaR Iter 3 (5.9k)  & 7.92 & 8.66 & 7.18 & 35.4 & 61.9\% \\
    STaPLe Iter 3 (5.9k) & \textbf{8.06} & \textbf{8.75} & 7.38 & \textbf{39.8} & \textbf{71.6\%} \\
    \midrule
    STaR Iter 4 (6.2k)   & 7.96 & 8.68 & 7.25 & 35.6 & 62.1\% \\
    STaPLe Iter 4 (6.3k) & 8.04 & 8.69 & \textbf{7.41} & 38.4 & 67.6\%  \\
    \midrule
    \midrule
    \textbf{Qwen2.5-7B-Instruct} & & & & & \\
    \midrule
    Initial Policy & 6.83 & 7.34 & 6.31 & 30.4 & -- \\
    \midrule
    STaR Iter 1 (30.9k) & 6.85  & 7.39  & 6.31 & 34.5 &  61.0\%  \\
    STaPLe Iter 1 (30.9k) & 7.03 & 7.48  & 6.59 & 37.3 & 68.2\%     \\
    \midrule
    STaR Iter 2 (6.5k)  & 6.98 & 7.45& 6.51 & 36.9 & 63.0\% \\
    STaPLe Iter 2 (6.5k) & 7.14 & 7.55 & 6.73 & 39.4 & 66.2\% \\
    \midrule
    STaR Iter 3 (7.1k)   & 7.08 & 7.58 & 6.59 & 37.6 & 66.4\% \\
    STaPLe Iter 3 (7.0k) & 7.20 & 7.63 & 6.78 & 39.8 & 72.5\%  \\
    \midrule
    STaR Iter 4 (7.1k)  & 7.14 & 7.63 & 6.66 & 37.8 & 68.4\% \\
    STaPLe Iter 4 (7.1k) & \textbf{7.24} & \textbf{7.64} & \textbf{6.85} & \textbf{40.2} & \textbf{73.4\%} \\
    \bottomrule
  \end{tabular}
\end{table}


