llm,R(1-2),R(1-3),R(1-4),R(3-4),R(1-34),mean_acc
llama3.1:8b,5.83,11.33,11.5,19.17,2.83,10.132
gpt-oss:20b,93.83,92.33,96.33,91.33,94.0,93.564
gpt-4.1-nano-2025-04-14,69.17,39.33,41.33,62.33,0.0,42.431999999999995
mistral-small:24b,91.69,0.16999999999999998,1.1900000000000002,99.49,11.86,40.879999999999995
llama3.1:70b,97.33000000000001,7.670000000000001,27.0,99.17,98.5,65.934
gemini-2.0-flash,90.83,79.17,21.33,95.17,80.33,73.366
gpt-4.1-mini-2025-04-14,91.0,34.5,20.830000000000002,99.0,95.83,68.232
gpt-4o,94.5,31.669999999999998,12.17,99.33,65.33,60.6
gpt-4.1-2025-04-14,94.5,98.67,69.67,99.5,99.33,92.334
grok-3-mini,91.67,94.83,96.5,97.67,96.83,95.5
deepseek-chat,96.0,35.5,20.830000000000002,99.33,90.5,68.432
gemini-2.5-flash,86.33,95.67,96.33,96.83,95.5,94.13199999999999
gpt-5-nano,91.83,85.0,91.5,88.5,91.17,89.6
deepseek-reasoner,88.83,97.0,96.67,96.33,94.33,94.63199999999999
gemini-2.5-pro,91.33,97.67,94.5,98.5,96.17,95.634
gpt-5-mini,90.33,97.17,97.83,95.0,95.17,95.1
o3,92.71000000000001,97.11999999999999,97.63,95.25,93.73,95.288
gpt-5,90.5,97.67,98.33,96.83,94.5,95.566
