 \sf Llama-3.1-8b & Base & 11.43 & 2.94 & 0.00 & 83.48 & 0.00 & 27.31 & 3.64 & 16.82 \\
 & CtE & 19.23 & 100.00 & \ditto & \ditto & 10.26 & 73.28 & \ditto & 50.77 \\ \midrule
 \sf GPT-oss-20b & Base & 22.20 & 38.27 & 31.66 & 84.67 & 15.60 & 44.83 & 6.45 & 40.29 \\
 & CtE & 67.84 & 100.00 & \ditto & \ditto & 48.26 & 79.69 & \ditto & 76.58 \\ \midrule
 \sf GPT-4.1-nano & Base & 30.84 & 44.49 & 38.31 & 59.63 & \ditto & 54.88 & 20.64 & \ditto \\
 & CtE & 96.51 & 66.23 & \ditto & \ditto & \ditto & 96.72 & \ditto & \ditto \\ \midrule
 \sf Mistral-small:24b & Base & 46.21 & 0.00 & 42.86 & 50.77 & 28.57 & 65.68 & 34.47 & 55.83 \\
 & CtE & 89.82 & 100.00 & \ditto & \ditto & 51.16 & 95.08 & \ditto & 81.67 \\ \midrule
 \sf Llama-3.1-70b & Base & 21.06 & 30.43 & 24.07 & 71.26 & 6.77 & 47.69 & 10.00 & 35.24 \\
 & CtE & 82.30 & 100.00 & \ditto & \ditto & 57.27 & 92.09 & \ditto & 86.26 \\ \midrule
 \sf Gemini-2.0-flash & Base & 33.21 & 41.03 & 44.53 & 63.46 & 11.60 & 60.45 & 13.01 & 46.20 \\
 & CtE & 89.25 & 100.00 & \ditto & \ditto & 84.55 & 94.75 & \ditto & 93.57 \\ \midrule
 \sf GPT-4.1-mini & Base & 34.80 & 41.55 & 44.00 & 56.40 & 13.39 & 64.19 & 15.52 & 50.27 \\
 & CtE & 93.42 & 65.32 & \ditto & \ditto & 87.39 & 97.17 & \ditto & 95.15 \\ \midrule
 \sf GPT-4o & Base & 44.97 & 53.68 & 52.05 & 63.09 & 23.79 & 65.84 & 26.36 & 55.03 \\
 & CtE & 98.28 & 99.25 & \ditto & \ditto & 92.57 & 59.12 & \ditto & 98.35 \\ \midrule
 \sf GPT-4.1 & Base & 40.74 & 49.83 & 42.11 & 63.65 & 16.44 & 69.17 & 10.90 & 54.46 \\
 & CtE & 94.53 & 70.34 & \ditto & \ditto & 87.14 & 98.65 & \ditto & 97.24 \\ \midrule
 \sf Grok-3-mini & Base & 35.64 & 52.37 & 43.70 & 88.74 & 26.85 & 64.90 & 5.04 & 57.93 \\
 & CtE & 95.42 & 100.00 & \ditto & \ditto & 79.52 & 98.67 & \ditto & 93.89 \\ \midrule
 \sf DeepSeek-V3.1 & Base & 32.99 & 44.13 & 36.80 & 56.04 & 15.84 & 57.66 & 16.63 & 46.73 \\
 & CtE & 92.81 & 100.00 & \ditto & \ditto & 85.71 & 95.98 & \ditto & 96.43 \\ \midrule
 \sf Gemini-2.5-flash & Base & 35.71 & 50.17 & 41.35 & 85.37 & 25.31 & 62.83 & 3.01 & 54.82 \\
 & CtE & 94.79 & 100.00 & \ditto & \ditto & 94.57 & 95.60 & \ditto & 96.53 \\ \midrule
 \sf GPT-5-nano & Base & 59.17 & 64.90 & 64.12 & 68.93 & 43.33 & 72.58 & 28.56 & 62.84 \\
 & CtE & 80.55 & 100.00 & \ditto & \ditto & 72.00 & 80.81 & \ditto & 78.26 \\ \midrule
 \sf DeepSeek-reasoner & Base & 20.08 & 40.72 & 30.17 & 82.01 & 14.31 & 48.13 & 3.94 & 41.90 \\
 & CtE & 65.47 & 100.00 & \ditto & \ditto & 60.56 & 67.65 & \ditto & 75.33 \\ \midrule
 \sf Gemini-2.5-pro & Base & 33.39 & 43.52 & 38.62 & 82.06 & 23.22 & 64.90 & 2.95 & 57.36 \\
 & CtE & 92.70 & 100.00 & \ditto & \ditto & 79.50 & 93.07 & \ditto & 88.22 \\ \midrule
 \sf GPT-5-mini & Base & 65.87 & 63.81 & 68.65 & 63.16 & 47.29 & 77.73 & 34.18 & 66.80 \\
 & CtE & 89.35 & 100.00 & \ditto & \ditto & 64.34 & 93.71 & \ditto & 76.66 \\ \midrule
 \sf GPT-o3 & Base & 35.83 & 50.09 & 38.89 & 87.54 & 23.87 & 63.97 & 4.98 & 55.25 \\
 & CtE & 85.54 & 88.41 & \ditto & \ditto & 83.01 & 94.99 & \ditto & 92.23 \\ \midrule
 \sf GPT-5 & Base & 62.06 & 64.16 & 65.08 & 75.39 & 42.68 & 78.91 & 20.50 & 68.34 \\
 & CtE & 95.47 & 100.00 & \ditto & \ditto & 89.49 & 98.21 & \ditto & 93.82 \\ \midrule
