 \sf Llama-3.1-8b & Base & 1.06 & 3.01 & 1.32 & 84.54 & 0.17 & 16.01 & 3.21 & 11.58 \\
 & CtE & 16.20 & 100.00 & \ditto & \ditto & 8.05 & 55.25 & \ditto & 50.09 \\ \midrule
 \sf GPT-oss-20b & Base & 13.51 & 34.78 & 54.55 & 65.38 & 11.11 & 33.38 & 21.46 & 32.12 \\
 & CtE & 55.88 & 100.00 & \ditto & \ditto & 50.00 & 74.50 & \ditto & 80.90 \\ \midrule
 \sf GPT-4.1-nano & Base & 21.08 & 46.43 & 36.36 & 65.93 & 11.83 & 43.23 & 16.67 & 39.49 \\
 & CtE & 99.41 & 73.50 & \ditto & \ditto & 56.50 & 99.71 & \ditto & 72.72 \\ \midrule
 \sf Mistral-small:24b & Base & 20.41 & 50.42 & 45.28 & 66.67 & 23.46 & 45.42 & 33.33 & 51.40 \\
 & CtE & 26.00 & 100.00 & \ditto & \ditto & 56.91 & 64.90 & \ditto & 81.06 \\ \midrule
 \sf Llama-3.1-70b & Base & 6.25 & 28.70 & 23.74 & 40.00 & 22.22 & 26.46 & 17.00 & 47.39 \\
 & CtE & 16.67 & 100.00 & \ditto & \ditto & 57.52 & 46.07 & \ditto & 86.09 \\ \midrule
 \sf Gemini-2.0-flash & Base & 28.12 & 43.18 & 32.84 & 51.28 & 14.29 & 56.97 & 12.86 & 45.45 \\
 & CtE & 42.86 & 100.00 & \ditto & \ditto & 85.25 & 55.66 & \ditto & 91.98 \\ \midrule
 \sf GPT-4.1-mini & Base & 22.22 & 51.65 & 36.84 & 50.00 & 28.00 & 50.59 & 37.72 & 47.75 \\
 & CtE & 52.83 & 63.47 & \ditto & \ditto & 75.47 & 80.86 & \ditto & 91.40 \\ \midrule
 \sf GPT-4o & Base & 48.48 & 53.17 & 46.30 & 25.00 & 28.71 & 56.77 & 75.00 & 51.64 \\
 & CtE & 64.71 & 99.50 & \ditto & \ditto & 94.12 & 43.37 & \ditto & 97.87 \\ \midrule
 \sf GPT-4.1 & Base & 21.21 & 50.00 & 30.77 & 0.00 & 25.00 & 42.59 & 68.06 & 37.38 \\
 & CtE & 45.45 & 70.00 & \ditto & \ditto & 82.35 & 78.50 & \ditto & 92.73 \\ \midrule
 \sf Grok-3-mini & Base & 20.00 & 51.61 & 47.62 & 42.86 & 5.26 & 47.73 & 29.75 & 33.28 \\
 & CtE & 44.44 & 100.00 & \ditto & \ditto & 57.89 & 73.36 & \ditto & 86.46 \\ \midrule
 \sf DeepSeek-V3.1 & Base & 20.83 & 46.25 & 41.05 & 50.00 & 22.81 & 41.98 & 14.06 & 45.65 \\
 & CtE & 54.55 & 100.00 & \ditto & \ditto & 83.92 & 74.09 & \ditto & 91.83 \\ \midrule
 \sf Gemini-2.5-flash & Base & 19.51 & 34.62 & 45.45 & 57.89 & 11.11 & 49.53 & 21.45 & 36.10 \\
 & CtE & 58.54 & 100.00 & \ditto & \ditto & 62.07 & 82.66 & \ditto & 86.86 \\ \midrule
 \sf GPT-5-nano & Base & 57.14 & 58.89 & 52.94 & 57.97 & 47.17 & 68.94 & 40.02 & 68.06 \\
 & CtE & 50.91 & 100.00 & \ditto & \ditto & 36.00 & 51.92 & \ditto & 52.07 \\ \midrule
 \sf DeepSeek-reasoner & Base & 7.46 & 22.22 & 30.00 & 81.82 & 0.00 & 29.98 & 4.50 & 26.44 \\
 & CtE & 36.36 & 100.00 & \ditto & \ditto & 37.50 & 55.45 & \ditto & 73.20 \\ \midrule
 \sf Gemini-2.5-pro & Base & 5.77 & 50.00 & 42.42 & 44.44 & 0.00 & 44.58 & 23.57 & 39.00 \\
 & CtE & 25.76 & 100.00 & \ditto & \ditto & 38.64 & 63.17 & \ditto & 60.09 \\ \midrule
 \sf GPT-5-mini & Base & 60.34 & 64.71 & 46.15 & 30.00 & 58.62 & 72.10 & 69.17 & 72.76 \\
 & CtE & 71.74 & 100.00 & \ditto & \ditto & 60.71 & 78.90 & \ditto & 77.00 \\ \midrule
 \sf GPT-o3 & Base & 16.28 & 70.59 & 28.57 & 67.86 & 8.11 & 43.24 & 16.46 & 39.48 \\
 & CtE & 32.50 & 86.36 & \ditto & \ditto & 41.38 & 68.66 & \ditto & 71.69 \\ \midrule
 \sf GPT-5 & Base & 50.88 & 64.29 & 60.00 & 15.79 & 51.52 & 71.02 & 71.31 & 64.70 \\
 & CtE & 56.25 & 100.00 & \ditto & \ditto & 62.07 & 78.48 & \ditto & 84.04 \\ \midrule
