A & \textsf{Llama-3.1-8b} & 5.83 & 11.33 & 11.50 & 19.17 & 2.83 & 10.13 \\
B & \textsf{GPT-oss-20b} & 93.83 & 92.33 & 96.33 & 91.33 & 94.00 & 93.56 \\
C & \textsf{GPT-4.1-nano} & 69.17 & 39.33 & 41.33 & 62.33 & 0.00 & 42.43 \\
D & \textsf{Mistral-small:24b} & 91.69 & 0.17 & 1.19 & 99.49 & 11.86 & 40.88 \\
E & \textsf{Llama-3.1-70b} & \textbf{97.33} & 7.67 & 27.00 & 99.17 & 98.50 & 65.93 \\
F & \textsf{Gemini-2.0-flash} & 90.83 & 79.17 & 21.33 & 95.17 & 80.33 & 73.37 \\
G & \textsf{GPT-4.1-mini} & 91.00 & 34.50 & 20.83 & 99.00 & 95.83 & 68.23 \\
H & \textsf{GPT-4o} & 94.50 & 31.67 & 12.17 & 99.33 & 65.33 & 60.60 \\
I & \textsf{GPT-4.1} & 94.50 & \textbf{98.67} & 69.67 & \textbf{99.50} & \textbf{99.33} & 92.33 \\
J & \textsf{Grok-3-mini} & 91.67 & 94.83 & 96.50 & 97.67 & 96.83 & 95.50 \\
K & \textsf{DeepSeek-V3.1} & 96.00 & 35.50 & 20.83 & 99.33 & 90.50 & 68.43 \\
L & \textsf{Gemini-2.5-flash} & 86.33 & 95.67 & 96.33 & 96.83 & 95.50 & 94.13 \\
M & \textsf{GPT-5-nano} & 91.83 & 85.00 & 91.50 & 88.50 & 91.17 & 89.60 \\
N & \textsf{DeepSeek-reasoner} & 88.83 & 97.00 & 96.67 & 96.33 & 94.33 & 94.63 \\
O & \textsf{Gemini-2.5-pro} & 91.33 & 97.67 & 94.50 & 98.50 & 96.17 & \textbf{95.63} \\
P & \textsf{GPT-5-mini} & 90.33 & 97.17 & 97.83 & 95.00 & 95.17 & 95.10 \\
Q & \textsf{GPT-o3} & 92.71 & 97.12 & 97.63 & 95.25 & 93.73 & 95.29 \\
R & \textsf{GPT-5} & 90.50 & 97.67 & \textbf{98.33} & 96.83 & 94.50 & 95.57 \\
\midrule
\multicolumn{2}{l !{\color{white!80!black} \vline width 1pt}}{\textit{Average}} & 86.01 & 66.25 & 60.64 & 90.48 & 77.55 & 76.19 \\