\sf Llama-3.1-8b  & Base & 93.83 & 86.33 & 87.33 & 28.50 & 97.17 \\
& CtE & 81.00 & 13.83 & \ditto & \ditto & 82.67 \\ \midrule
\sf GPT-oss-20b  & Base & 26.17 & 40.33 & 32.17 & 80.33 & 16.00 \\
& CtE & 66.50 & 89.33 & \ditto & \ditto & 40.33 \\ \midrule
\sf GPT-4.1-nano  & Base & 45.67 & 50.00 & 53.17 & 50.00 & 96.00 \\
& CtE & 69.33 & 36.50 & \ditto & \ditto & 100.00 \\ \midrule
\sf Mistral-small:24b  & Base & 48.98 & 49.49 & 54.58 & 50.68 & 87.29 \\
& CtE & 88.50 & 0.17 & \ditto & \ditto & 56.50 \\ \midrule
\sf Llama-3.1-70b  & Base & 23.00 & 68.17 & 62.17 & 71.17 & 4.33 \\
& CtE & 82.33 & 9.00 & \ditto & \ditto & 53.67 \\ \midrule
\sf Gemini-2.0-flash  & Base & 37.33 & 44.50 & 62.33 & 62.50 & 23.33 \\
& CtE & 88.50 & 46.83 & \ditto & \ditto & 46.17 \\ \midrule
\sf GPT-4.1-mini  & Base & 38.67 & 46.00 & 59.17 & 56.33 & 8.67 \\
& CtE & 89.33 & 44.83 & \ditto & \ditto & 62.50 \\ \midrule
\sf GPT-4o  & Base & 45.33 & 49.00 & 53.50 & 63.17 & 37.67 \\
& CtE & 96.50 & 66.33 & \ditto & \ditto & 73.50 \\ \midrule
\sf GPT-4.1  & Base & 42.83 & 49.83 & 50.33 & 63.83 & 10.83 \\
& CtE & 92.33 & 69.67 & \ditto & \ditto & 74.67 \\ \midrule
\sf Grok-3-mini  & Base & 39.33 & 52.17 & 44.00 & 88.00 & 26.17 \\
& CtE & 91.83 & 95.17 & \ditto & \ditto & 69.33 \\ \midrule
\sf DeepSeek-V3.1  & Base & 34.83 & 50.33 & 54.33 & 56.00 & 14.83 \\
& CtE & 89.33 & 10.00 & \ditto & \ditto & 52.00 \\ \midrule
\sf Gemini-2.5-flash  & Base & 41.83 & 50.83 & 41.83 & 84.00 & 25.67 \\
& CtE & 87.50 & 94.83 & \ditto & \ditto & 85.33 \\ \midrule
\sf GPT-5-nano  & Base & 57.83 & 61.33 & 62.67 & 65.83 & 24.50 \\
& CtE & 77.67 & 89.33 & \ditto & \ditto & 8.83 \\ \midrule
\sf DeepSeek-reasoner  & Base & 28.17 & 41.83 & 31.50 & 79.67 & 16.50 \\
& CtE & 65.33 & 92.00 & \ditto & \ditto & 54.00 \\ \midrule
\sf Gemini-2.5-pro  & Base & 38.67 & 43.67 & 39.67 & 81.67 & 23.33 \\
& CtE & 90.67 & 95.83 & \ditto & \ditto & 74.67 \\ \midrule
\sf GPT-5-mini  & Base & 63.33 & 63.00 & 68.33 & 63.50 & 19.17 \\
& CtE & 84.67 & 96.17 & \ditto & \ditto & 38.00 \\ \midrule
\sf GPT-o3  & Base & 39.32 & 49.49 & 39.66 & 84.92 & 25.42 \\
& CtE & 84.33 & 85.67 & \ditto & \ditto & 77.50 \\ \midrule
\sf GPT-5  & Base & 60.83 & 63.50 & 64.67 & 75.67 & 26.83 \\
& CtE & 91.33 & 98.17 & \ditto & \ditto & 63.00 \\ \midrule
