,judge,solver,solver_id,problem,competition,true_grade,split,cost,confidence,incorrect
0,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_1,allrussian,incorrect,generic,0.0,,True
1,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_10,allrussian,incorrect,generic,0.0,,True
2,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_11,allrussian,detected,generic,0.0,,False
3,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_12,allrussian,incorrect,generic,0.0,,True
4,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_13,allrussian,detected,generic,0.0,,False
5,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_14,allrussian,incorrect,generic,0.0,,True
6,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_15,allrussian,incorrect,generic,0.0,,True
7,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_16,allrussian,incorrect,generic,0.0,,True
8,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_2,allrussian,incorrect,generic,0.0,,True
9,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_3,allrussian,incorrect,generic,0.0,,True
10,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_4,allrussian,incorrect,generic,0.0,,True
11,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_5,allrussian,correct,generic,0.0,,False
12,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_6,allrussian,incorrect,generic,0.0,,True
13,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_7,allrussian,incorrect,generic,0.0,,True
14,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_8,allrussian,incorrect,generic,0.0,,True
15,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,allrussian_2025_9,allrussian,correct,generic,0.0,,False
16,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmo_2025_1,bmo,incorrect,generic,0.0,,True
17,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmo_2025_2,bmo,incorrect,generic,0.0,,True
18,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmo_2025_3,bmo,corrected,generic,0.0,,False
19,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmo_2025_4,bmo,incorrect,generic,0.0,,True
20,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_A_2025_1,bmosl,detected,generic,0.0,,False
21,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_A_2025_2,bmosl,incorrect,generic,0.0,,True
22,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_A_2025_3,bmosl,incorrect,generic,0.0,,True
23,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_A_2025_4,bmosl,incorrect,generic,0.0,,True
24,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_A_2025_5,bmosl,detected,generic,0.0,,False
25,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_A_2025_6,bmosl,detected,generic,0.0,,False
26,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_C_2025_1,bmosl,incorrect,generic,0.0,,True
27,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_C_2025_2,bmosl,detected,generic,0.0,,False
28,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_C_2025_3,bmosl,incorrect,generic,0.0,,True
29,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_C_2025_4,bmosl,incorrect,generic,0.0,,True
30,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_C_2025_5,bmosl,incorrect,generic,0.0,,True
31,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_C_2025_6,bmosl,incorrect,generic,0.0,,True
32,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_G_2025_1,bmosl,incorrect,generic,0.0,,True
33,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_G_2025_2,bmosl,incorrect,generic,0.0,,True
34,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_G_2025_3,bmosl,incorrect,generic,0.0,,True
35,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_G_2025_4,bmosl,incorrect,generic,0.0,,True
36,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_G_2025_5,bmosl,incorrect,generic,0.0,,True
37,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_G_2025_6,bmosl,incorrect,generic,0.0,,True
38,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_G_2025_7,bmosl,incorrect,generic,0.0,,True
39,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_NT_2025_1,bmosl,incorrect,generic,0.0,,True
40,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_NT_2025_2,bmosl,detected,generic,0.0,,False
41,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_NT_2025_3,bmosl,incorrect,generic,0.0,,True
42,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_NT_2025_4,bmosl,incorrect,generic,0.0,,True
43,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_NT_2025_5,bmosl,detected,generic,0.0,,False
44,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_NT_2025_6,bmosl,incorrect,generic,0.0,,True
45,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bmosl_NT_2025_7,bmosl,detected,generic,0.0,,False
46,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bulgaria_2025_1,bulgaria,incorrect,generic,0.0,,True
47,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bulgaria_2025_2,bulgaria,detected,generic,0.0,,False
48,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bulgaria_2025_3,bulgaria,detected,generic,0.0,,False
49,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bulgaria_2025_4,bulgaria,incorrect,generic,0.0,,True
50,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bulgaria_2025_5,bulgaria,detected,generic,0.0,,False
51,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,bulgaria_2025_6,bulgaria,detected,generic,0.0,,False
52,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,canada_2025_1,canada,detected,generic,0.0,,False
53,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,canada_2025_2,canada,detected,generic,0.0,,False
54,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,canada_2025_3,canada,incorrect,generic,0.0,,True
55,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,canada_2025_4,canada,incorrect,generic,0.0,,True
56,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,canada_2025_5,canada,incorrect,generic,0.0,,True
57,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,china_2025_1,china,detected,generic,0.0,,False
58,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,china_2025_2,china,incorrect,generic,0.0,,True
59,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,china_2025_3,china,incorrect,generic,0.0,,True
60,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,china_2025_5,china,incorrect,generic,0.0,,True
61,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,china_2025_6,china,incorrect,generic,0.0,,True
62,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_1,chinatst,incorrect,generic,0.0,,True
63,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_10,chinatst,incorrect,generic,0.0,,True
64,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_11,chinatst,correct,generic,0.0,,False
65,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_12,chinatst,incorrect,generic,0.0,,True
66,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_13,chinatst,incorrect,generic,0.0,,True
67,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_14,chinatst,detected,generic,0.0,,False
68,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_15,chinatst,incorrect,generic,0.0,,True
69,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_16,chinatst,incorrect,generic,0.0,,True
70,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_17,chinatst,incorrect,generic,0.0,,True
71,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_18,chinatst,incorrect,generic,0.0,,True
72,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_19,chinatst,incorrect,generic,0.0,,True
73,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_2,chinatst,incorrect,generic,0.0,,True
74,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_20,chinatst,detected,generic,0.0,,False
75,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_21,chinatst,incorrect,generic,0.0,,True
76,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_22,chinatst,incorrect,generic,0.0,,True
77,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_3,chinatst,detected,generic,0.0,,False
78,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_4,chinatst,incorrect,generic,0.0,,True
79,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_5,chinatst,incorrect,generic,0.0,,True
80,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_6,chinatst,correct,generic,0.0,,False
81,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_7,chinatst,incorrect,generic,0.0,,True
82,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_8,chinatst,incorrect,generic,0.0,,True
83,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,chinatst_2025_9,chinatst,detected,generic,0.0,,False
84,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,egmo_2025_1,egmo,correct,generic,0.0,,False
85,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,egmo_2025_2,egmo,incorrect,generic,0.0,,True
86,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,egmo_2025_3,egmo,incorrect,generic,0.0,,True
87,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,egmo_2025_4,egmo,incorrect,generic,0.0,,True
88,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,egmo_2025_5,egmo,incorrect,generic,0.0,,True
89,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,egmo_2025_6,egmo,incorrect,generic,0.0,,True
90,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_A_2025_1,elmosl,incorrect,generic,0.0,,True
91,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_A_2025_2,elmosl,detected,generic,0.0,,False
92,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_A_2025_3,elmosl,incorrect,generic,0.0,,True
93,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_A_2025_5,elmosl,incorrect,generic,0.0,,True
94,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_A_2025_6,elmosl,correct,generic,0.0,,False
95,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_A_2025_7,elmosl,incorrect,generic,0.0,,True
96,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_A_2025_8,elmosl,incorrect,generic,0.0,,True
97,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_C_2025_1,elmosl,incorrect,generic,0.0,,True
98,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_C_2025_2,elmosl,incorrect,generic,0.0,,True
99,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_C_2025_3,elmosl,corrected,generic,0.0,,False
100,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_C_2025_4,elmosl,incorrect,generic,0.0,,True
101,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_C_2025_5,elmosl,incorrect,generic,0.0,,True
102,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_C_2025_6,elmosl,incorrect,generic,0.0,,True
103,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_C_2025_7,elmosl,incorrect,generic,0.0,,True
104,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_C_2025_8,elmosl,detected,generic,0.0,,False
105,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_C_2025_9,elmosl,incorrect,generic,0.0,,True
106,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_G_2025_1,elmosl,incorrect,generic,0.0,,True
107,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_G_2025_2,elmosl,incorrect,generic,0.0,,True
108,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_G_2025_3,elmosl,incorrect,generic,0.0,,True
109,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_G_2025_4,elmosl,incorrect,generic,0.0,,True
110,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_G_2025_5,elmosl,corrected,generic,0.0,,False
111,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_G_2025_6,elmosl,incorrect,generic,0.0,,True
112,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_G_2025_7,elmosl,detected,generic,0.0,,False
113,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_G_2025_8,elmosl,incorrect,generic,0.0,,True
114,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_G_2025_9,elmosl,incorrect,generic,0.0,,True
115,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_NT_2025_1,elmosl,incorrect,generic,0.0,,True
116,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_NT_2025_2,elmosl,incorrect,generic,0.0,,True
117,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_NT_2025_3,elmosl,detected,generic,0.0,,False
118,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_NT_2025_4,elmosl,incorrect,generic,0.0,,True
119,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_NT_2025_5,elmosl,detected,generic,0.0,,False
120,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_NT_2025_6,elmosl,incorrect,generic,0.0,,True
121,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,elmosl_NT_2025_7,elmosl,detected,generic,0.0,,False
122,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,german_2025_1,german,correct,generic,0.0,,False
123,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,german_2025_2,german,incorrect,generic,0.0,,True
124,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,german_2025_3,german,detected,generic,0.0,,False
125,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,german_2025_4,german,correct,generic,0.0,,False
126,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,greece_2025_1,greece,correct,generic,0.0,,False
127,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,greece_2025_2,greece,incorrect,generic,0.0,,True
128,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,greece_2025_3,greece,incorrect,generic,0.0,,True
129,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,greece_2025_4,greece,detected,generic,0.0,,False
130,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_1,imosl,corrected,generic,0.0,,False
131,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_10,imosl,incorrect,generic,0.0,,True
132,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_11,imosl,incorrect,generic,0.0,,True
133,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_12,imosl,detected,generic,0.0,,False
134,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_13,imosl,detected,generic,0.0,,False
135,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_14,imosl,incorrect,generic,0.0,,True
136,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_15,imosl,incorrect,generic,0.0,,True
137,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_16,imosl,incorrect,generic,0.0,,True
138,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_17,imosl,incorrect,generic,0.0,,True
139,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_18,imosl,incorrect,generic,0.0,,True
140,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_19,imosl,incorrect,generic,0.0,,True
141,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_2,imosl,corrected,generic,0.0,,False
142,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_20,imosl,incorrect,generic,0.0,,True
143,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_21,imosl,incorrect,generic,0.0,,True
144,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_22,imosl,incorrect,generic,0.0,,True
145,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_23,imosl,incorrect,generic,0.0,,True
146,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_24,imosl,incorrect,generic,0.0,,True
147,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_25,imosl,correct,generic,0.0,,False
148,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_26,imosl,correct,generic,0.0,,False
149,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_27,imosl,correct,generic,0.0,,False
150,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_28,imosl,incorrect,generic,0.0,,True
151,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_29,imosl,incorrect,generic,0.0,,True
152,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_3,imosl,detected,generic,0.0,,False
153,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_30,imosl,incorrect,generic,0.0,,True
154,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_31,imosl,detected,generic,0.0,,False
155,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_32,imosl,detected,generic,0.0,,False
156,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_33,imosl,detected,generic,0.0,,False
157,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_34,imosl,incorrect,generic,0.0,,True
158,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_35,imosl,incorrect,generic,0.0,,True
159,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_36,imosl,detected,generic,0.0,,False
160,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_37,imosl,incorrect,generic,0.0,,True
161,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_4,imosl,incorrect,generic,0.0,,True
162,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_5,imosl,corrected,generic,0.0,,False
163,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_6,imosl,incorrect,generic,0.0,,True
164,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_7,imosl,detected,generic,0.0,,False
165,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_8,imosl,incorrect,generic,0.0,,True
166,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,imosl_2025_9,imosl,correct,generic,0.0,,False
167,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_2025_1,india,correct,generic,0.0,,False
168,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_2025_2,india,incorrect,generic,0.0,,True
169,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_2025_3,india,incorrect,generic,0.0,,True
170,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_2025_4,india,detected,generic,0.0,,False
171,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_2025_5,india,incorrect,generic,0.0,,True
172,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_2025_6,india,incorrect,generic,0.0,,True
173,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_1,india,detected,generic,0.0,,False
174,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_10,india,incorrect,generic,0.0,,True
175,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_11,india,incorrect,generic,0.0,,True
176,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_12,india,incorrect,generic,0.0,,True
177,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_13,india,incorrect,generic,0.0,,True
178,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_14,india,detected,generic,0.0,,False
179,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_15,india,detected,generic,0.0,,False
180,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_16,india,corrected,generic,0.0,,False
181,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_17,india,incorrect,generic,0.0,,True
182,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_18,india,incorrect,generic,0.0,,True
183,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_19,india,incorrect,generic,0.0,,True
184,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_2,india,incorrect,generic,0.0,,True
185,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_20,india,incorrect,generic,0.0,,True
186,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_21,india,detected,generic,0.0,,False
187,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_3,india,detected,generic,0.0,,False
188,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_4,india,incorrect,generic,0.0,,True
189,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_5,india,incorrect,generic,0.0,,True
190,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_6,india,incorrect,generic,0.0,,True
191,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_7,india,incorrect,generic,0.0,,True
192,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_8,india,detected,generic,0.0,,False
193,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,india_prep_2025_9,india,detected,generic,0.0,,False
194,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,iran_tst_2025_1,iran,detected,generic,0.0,,False
195,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,iran_tst_2025_2,iran,incorrect,generic,0.0,,True
196,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,iran_tst_2025_3,iran,incorrect,generic,0.0,,True
197,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,iran_tst_2025_4,iran,incorrect,generic,0.0,,True
198,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,iran_tst_2025_5,iran,incorrect,generic,0.0,,True
199,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,iran_tst_2025_6,iran,incorrect,generic,0.0,,True
200,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,iran_tst_2025_7,iran,incorrect,generic,0.0,,True
201,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,iran_tst_2025_8,iran,incorrect,generic,0.0,,True
202,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,iran_tst_2025_9,iran,incorrect,generic,0.0,,True
203,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_1,israel,correct,generic,0.0,,False
204,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_10,israel,incorrect,generic,0.0,,True
205,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_2,israel,incorrect,generic,0.0,,True
206,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_3,israel,detected,generic,0.0,,False
207,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_4,israel,incorrect,generic,0.0,,True
208,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_5,israel,detected,generic,0.0,,False
209,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_6,israel,incorrect,generic,0.0,,True
210,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_7,israel,detected,generic,0.0,,False
211,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_8,israel,incorrect,generic,0.0,,True
212,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,israel_tst_2025_9,israel,incorrect,generic,0.0,,True
213,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,izho_2025_1,izho,incorrect,generic,0.0,,True
214,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,izho_2025_2,izho,incorrect,generic,0.0,,True
215,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,izho_2025_3,izho,incorrect,generic,0.0,,True
216,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,izho_2025_4,izho,incorrect,generic,0.0,,True
217,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,izho_2025_5,izho,incorrect,generic,0.0,,True
218,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,izho_2025_6,izho,detected,generic,0.0,,False
219,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,jbmo_2025_1,jbmo,detected,generic,0.0,,False
220,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,jbmo_2025_2,jbmo,corrected,generic,0.0,,False
221,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,jbmo_2025_3,jbmo,detected,generic,0.0,,False
222,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,jbmo_2025_4,jbmo,detected,generic,0.0,,False
223,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,korea_2025_1,korea,correct,generic,0.0,,False
224,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,korea_2025_2,korea,incorrect,generic,0.0,,True
225,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,korea_2025_3,korea,incorrect,generic,0.0,,True
226,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,korea_2025_4,korea,incorrect,generic,0.0,,True
227,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,korea_2025_5,korea,incorrect,generic,0.0,,True
228,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,korea_2025_6,korea,incorrect,generic,0.0,,True
229,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_1,matharena,correct,matharena,0.0,,False
230,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_10,matharena,incorrect,matharena,0.0,,True
231,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_11,matharena,incorrect,matharena,0.0,,True
232,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_12,matharena,incorrect,matharena,0.0,,True
233,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_13,matharena,incorrect,matharena,0.0,,True
234,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_14,matharena,incorrect,matharena,0.0,,True
235,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_15,matharena,incorrect,matharena,0.0,,True
236,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_16,matharena,correct,matharena,0.0,,False
237,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_17,matharena,incorrect,matharena,0.0,,True
238,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_18,matharena,corrected,matharena,0.0,,False
239,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_19,matharena,corrected,matharena,0.0,,False
240,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_2,matharena,incorrect,matharena,0.0,,True
241,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_20,matharena,detected,matharena,0.0,,False
242,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_21,matharena,correct,matharena,0.0,,False
243,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_22,matharena,correct,matharena,0.0,,False
244,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_23,matharena,incorrect,matharena,0.0,,True
245,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_24,matharena,correct,matharena,0.0,,False
246,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_25,matharena,correct,matharena,0.0,,False
247,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_26,matharena,incorrect,matharena,0.0,,True
248,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_27,matharena,detected,matharena,0.0,,False
249,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_28,matharena,incorrect,matharena,0.0,,True
250,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_29,matharena,detected,matharena,0.0,,False
251,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_3,matharena,correct,matharena,0.0,,False
252,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_30,matharena,incorrect,matharena,0.0,,True
253,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_4,matharena,incorrect,matharena,0.0,,True
254,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_5,matharena,correct,matharena,0.0,,False
255,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_6,matharena,correct,matharena,0.0,,False
256,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_7,matharena,corrected,matharena,0.0,,False
257,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_8,matharena,corrected,matharena,0.0,,False
258,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_aime_aime_2025_9,matharena,incorrect,matharena,0.0,,True
259,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_1,matharena,correct,matharena,0.0,,False
260,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_10,matharena,corrected,matharena,0.0,,False
261,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_11,matharena,correct,matharena,0.0,,False
262,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_12,matharena,correct,matharena,0.0,,False
263,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_13,matharena,incorrect,matharena,0.0,,True
264,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_14,matharena,correct,matharena,0.0,,False
265,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_15,matharena,detected,matharena,0.0,,False
266,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_16,matharena,correct,matharena,0.0,,False
267,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_17,matharena,correct,matharena,0.0,,False
268,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_18,matharena,correct,matharena,0.0,,False
269,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_19,matharena,correct,matharena,0.0,,False
270,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_2,matharena,correct,matharena,0.0,,False
271,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_20,matharena,correct,matharena,0.0,,False
272,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_21,matharena,correct,matharena,0.0,,False
273,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_22,matharena,incorrect,matharena,0.0,,True
274,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_23,matharena,corrected,matharena,0.0,,False
275,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_24,matharena,correct,matharena,0.0,,False
276,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_25,matharena,correct,matharena,0.0,,False
277,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_26,matharena,correct,matharena,0.0,,False
278,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_27,matharena,correct,matharena,0.0,,False
279,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_28,matharena,detected,matharena,0.0,,False
280,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_29,matharena,corrected,matharena,0.0,,False
281,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_3,matharena,correct,matharena,0.0,,False
282,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_30,matharena,incorrect,matharena,0.0,,True
283,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_4,matharena,correct,matharena,0.0,,False
284,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_5,matharena,corrected,matharena,0.0,,False
285,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_6,matharena,correct,matharena,0.0,,False
286,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_7,matharena,correct,matharena,0.0,,False
287,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_8,matharena,correct,matharena,0.0,,False
288,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_brumo_brumo_2025_9,matharena,correct,matharena,0.0,,False
289,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_1,matharena,correct,matharena,0.0,,False
290,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_10,matharena,correct,matharena,0.0,,False
291,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_11,matharena,incorrect,matharena,0.0,,True
292,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_12,matharena,correct,matharena,0.0,,False
293,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_13,matharena,corrected,matharena,0.0,,False
294,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_14,matharena,detected,matharena,0.0,,False
295,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_15,matharena,incorrect,matharena,0.0,,True
296,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_16,matharena,incorrect,matharena,0.0,,True
297,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_17,matharena,correct,matharena,0.0,,False
298,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_18,matharena,incorrect,matharena,0.0,,True
299,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_19,matharena,incorrect,matharena,0.0,,True
300,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_2,matharena,incorrect,matharena,0.0,,True
301,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_20,matharena,incorrect,matharena,0.0,,True
302,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_21,matharena,incorrect,matharena,0.0,,True
303,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_22,matharena,correct,matharena,0.0,,False
304,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_23,matharena,correct,matharena,0.0,,False
305,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_24,matharena,correct,matharena,0.0,,False
306,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_25,matharena,incorrect,matharena,0.0,,True
307,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_26,matharena,correct,matharena,0.0,,False
308,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_27,matharena,detected,matharena,0.0,,False
309,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_28,matharena,incorrect,matharena,0.0,,True
310,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_29,matharena,incorrect,matharena,0.0,,True
311,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_3,matharena,corrected,matharena,0.0,,False
312,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_30,matharena,incorrect,matharena,0.0,,True
313,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_31,matharena,correct,matharena,0.0,,False
314,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_32,matharena,incorrect,matharena,0.0,,True
315,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_33,matharena,correct,matharena,0.0,,False
316,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_34,matharena,detected,matharena,0.0,,False
317,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_35,matharena,detected,matharena,0.0,,False
318,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_36,matharena,incorrect,matharena,0.0,,True
319,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_37,matharena,detected,matharena,0.0,,False
320,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_38,matharena,detected,matharena,0.0,,False
321,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_39,matharena,correct,matharena,0.0,,False
322,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_4,matharena,correct,matharena,0.0,,False
323,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_40,matharena,incorrect,matharena,0.0,,True
324,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_5,matharena,detected,matharena,0.0,,False
325,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_6,matharena,correct,matharena,0.0,,False
326,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_7,matharena,incorrect,matharena,0.0,,True
327,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_8,matharena,incorrect,matharena,0.0,,True
328,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_cmimc_cmimc_2025_9,matharena,incorrect,matharena,0.0,,True
329,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_1,matharena,correct,matharena,0.0,,False
330,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_10,matharena,incorrect,matharena,0.0,,True
331,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_11,matharena,detected,matharena,0.0,,False
332,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_12,matharena,correct,matharena,0.0,,False
333,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_13,matharena,detected,matharena,0.0,,False
334,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_14,matharena,incorrect,matharena,0.0,,True
335,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_15,matharena,correct,matharena,0.0,,False
336,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_16,matharena,detected,matharena,0.0,,False
337,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_17,matharena,incorrect,matharena,0.0,,True
338,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_18,matharena,incorrect,matharena,0.0,,True
339,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_19,matharena,incorrect,matharena,0.0,,True
340,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_2,matharena,corrected,matharena,0.0,,False
341,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_20,matharena,incorrect,matharena,0.0,,True
342,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_21,matharena,correct,matharena,0.0,,False
343,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_22,matharena,correct,matharena,0.0,,False
344,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_23,matharena,correct,matharena,0.0,,False
345,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_24,matharena,corrected,matharena,0.0,,False
346,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_25,matharena,incorrect,matharena,0.0,,True
347,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_26,matharena,correct,matharena,0.0,,False
348,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_27,matharena,incorrect,matharena,0.0,,True
349,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_28,matharena,detected,matharena,0.0,,False
350,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_29,matharena,detected,matharena,0.0,,False
351,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_3,matharena,incorrect,matharena,0.0,,True
352,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_30,matharena,detected,matharena,0.0,,False
353,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_4,matharena,correct,matharena,0.0,,False
354,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_5,matharena,corrected,matharena,0.0,,False
355,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_6,matharena,correct,matharena,0.0,,False
356,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_7,matharena,correct,matharena,0.0,,False
357,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_8,matharena,correct,matharena,0.0,,False
358,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_hmmt_hmmt_feb_2025_9,matharena,corrected,matharena,0.0,,False
359,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_1,matharena,correct,matharena,0.0,,False
360,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_10,matharena,detected,matharena,0.0,,False
361,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_11,matharena,detected,matharena,0.0,,False
362,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_12,matharena,correct,matharena,0.0,,False
363,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_13,matharena,incorrect,matharena,0.0,,True
364,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_14,matharena,correct,matharena,0.0,,False
365,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_15,matharena,correct,matharena,0.0,,False
366,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_16,matharena,correct,matharena,0.0,,False
367,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_17,matharena,correct,matharena,0.0,,False
368,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_18,matharena,detected,matharena,0.0,,False
369,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_19,matharena,correct,matharena,0.0,,False
370,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_2,matharena,corrected,matharena,0.0,,False
371,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_20,matharena,incorrect,matharena,0.0,,True
372,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_21,matharena,detected,matharena,0.0,,False
373,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_22,matharena,incorrect,matharena,0.0,,True
374,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_23,matharena,corrected,matharena,0.0,,False
375,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_24,matharena,detected,matharena,0.0,,False
376,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_25,matharena,incorrect,matharena,0.0,,True
377,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_26,matharena,correct,matharena,0.0,,False
378,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_27,matharena,incorrect,matharena,0.0,,True
379,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_28,matharena,correct,matharena,0.0,,False
380,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_29,matharena,correct,matharena,0.0,,False
381,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_3,matharena,correct,matharena,0.0,,False
382,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_30,matharena,incorrect,matharena,0.0,,True
383,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_31,matharena,correct,matharena,0.0,,False
384,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_32,matharena,correct,matharena,0.0,,False
385,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_33,matharena,incorrect,matharena,0.0,,True
386,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_34,matharena,incorrect,matharena,0.0,,True
387,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_35,matharena,correct,matharena,0.0,,False
388,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_36,matharena,correct,matharena,0.0,,False
389,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_37,matharena,corrected,matharena,0.0,,False
390,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_38,matharena,correct,matharena,0.0,,False
391,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_39,matharena,correct,matharena,0.0,,False
392,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_4,matharena,correct,matharena,0.0,,False
393,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_40,matharena,correct,matharena,0.0,,False
394,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_41,matharena,incorrect,matharena,0.0,,True
395,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_42,matharena,incorrect,matharena,0.0,,True
396,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_43,matharena,incorrect,matharena,0.0,,True
397,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_44,matharena,correct,matharena,0.0,,False
398,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_45,matharena,correct,matharena,0.0,,False
399,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_46,matharena,correct,matharena,0.0,,False
400,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_47,matharena,correct,matharena,0.0,,False
401,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_48,matharena,correct,matharena,0.0,,False
402,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_49,matharena,correct,matharena,0.0,,False
403,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_5,matharena,corrected,matharena,0.0,,False
404,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_50,matharena,correct,matharena,0.0,,False
405,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_51,matharena,correct,matharena,0.0,,False
406,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_52,matharena,detected,matharena,0.0,,False
407,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_53,matharena,correct,matharena,0.0,,False
408,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_6,matharena,detected,matharena,0.0,,False
409,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_7,matharena,correct,matharena,0.0,,False
410,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_8,matharena,detected,matharena,0.0,,False
411,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,matharena_smt_smt_2025_9,matharena,corrected,matharena,0.0,,False
412,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,nordic_2025_1,nordic,correct,generic,0.0,,False
413,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,nordic_2025_2,nordic,detected,generic,0.0,,False
414,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,nordic_2025_3,nordic,detected,generic,0.0,,False
415,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,pan_african_2025_1,pan,incorrect,generic,0.0,,True
416,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,pan_african_2025_2,pan,correct,generic,0.0,,False
417,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,pan_african_2025_3,pan,corrected,generic,0.0,,False
418,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,pan_african_2025_4,pan,corrected,generic,0.0,,False
419,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,pan_african_2025_5,pan,incorrect,generic,0.0,,True
420,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,pan_african_2025_6,pan,incorrect,generic,0.0,,True
421,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,philippines_2025_1,philippines,correct,generic,0.0,,False
422,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,philippines_2025_2,philippines,incorrect,generic,0.0,,True
423,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,philippines_2025_3,philippines,detected,generic,0.0,,False
424,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,philippines_2025_4,philippines,incorrect,generic,0.0,,True
425,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,philippines_2025_5,philippines,detected,generic,0.0,,False
426,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,philippines_2025_6,philippines,incorrect,generic,0.0,,True
427,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,philippines_2025_7,philippines,incorrect,generic,0.0,,True
428,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,philippines_2025_8,philippines,incorrect,generic,0.0,,True
429,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,polish_2025_1,polish,correct,generic,0.0,,False
430,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,polish_2025_2,polish,incorrect,generic,0.0,,True
431,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,polish_2025_3,polish,incorrect,generic,0.0,,True
432,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,polish_2025_4,polish,detected,generic,0.0,,False
433,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,polish_2025_5,polish,incorrect,generic,0.0,,True
434,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,polish_2025_6,polish,detected,generic,0.0,,False
435,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,rmm_2025_1,rmm,incorrect,generic,0.0,,True
436,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,rmm_2025_2,rmm,detected,generic,0.0,,False
437,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,rmm_2025_3,rmm,incorrect,generic,0.0,,True
438,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,rmm_2025_4,rmm,incorrect,generic,0.0,,True
439,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,rmm_2025_5,rmm,incorrect,generic,0.0,,True
440,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,rmm_2025_6,rmm,corrected,generic,0.0,,False
441,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_10_2025_1,romania,incorrect,generic,0.0,,True
442,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_10_2025_2,romania,incorrect,generic,0.0,,True
443,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_10_2025_3,romania,incorrect,generic,0.0,,True
444,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_11_2025_1,romania,correct,generic,0.0,,False
445,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_11_2025_2,romania,correct,generic,0.0,,False
446,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_11_2025_3,romania,incorrect,generic,0.0,,True
447,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_12_2025_1,romania,detected,generic,0.0,,False
448,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_12_2025_2,romania,correct,generic,0.0,,False
449,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_12_2025_3,romania,incorrect,generic,0.0,,True
450,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_9_2025_1,romania,incorrect,generic,0.0,,True
451,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_9_2025_2,romania,detected,generic,0.0,,False
452,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_tst_2025_1,romania,incorrect,generic,0.0,,True
453,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_tst_2025_2,romania,incorrect,generic,0.0,,True
454,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,romania_tst_2025_3,romania,detected,generic,0.0,,False
455,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,serbia_tst_bmo_2025_1,serbia,detected,generic,0.0,,False
456,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,serbia_tst_bmo_2025_2,serbia,correct,generic,0.0,,False
457,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,serbia_tst_bmo_2025_3,serbia,detected,generic,0.0,,False
458,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,serbia_tst_bmo_2025_4,serbia,incorrect,generic,0.0,,True
459,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,spain_2025_1,spain,correct,generic,0.0,,False
460,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,spain_2025_2,spain,detected,generic,0.0,,False
461,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,spain_2025_3,spain,incorrect,generic,0.0,,True
462,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,spain_2025_4,spain,incorrect,generic,0.0,,True
463,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,spain_2025_5,spain,detected,generic,0.0,,False
464,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_1,thai,correct,generic,0.0,,False
465,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_10,thai,incorrect,generic,0.0,,True
466,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_2,thai,detected,generic,0.0,,False
467,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_3,thai,detected,generic,0.0,,False
468,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_4,thai,incorrect,generic,0.0,,True
469,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_5,thai,detected,generic,0.0,,False
470,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_6,thai,incorrect,generic,0.0,,True
471,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_7,thai,incorrect,generic,0.0,,True
472,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_8,thai,incorrect,generic,0.0,,True
473,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,thai_2025_9,thai,detected,generic,0.0,,False
474,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,turkey_tst_2025_1,turkey,detected,generic,0.0,,False
475,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,turkey_tst_2025_2,turkey,incorrect,generic,0.0,,True
476,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,turkey_tst_2025_3,turkey,incorrect,generic,0.0,,True
477,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,turkey_tst_2025_4,turkey,incorrect,generic,0.0,,True
478,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,turkey_tst_2025_5,turkey,incorrect,generic,0.0,,True
479,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,turkey_tst_2025_6,turkey,incorrect,generic,0.0,,True
480,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,turkey_tst_2025_7,turkey,incorrect,generic,0.0,,True
481,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,turkey_tst_2025_8,turkey,detected,generic,0.0,,False
482,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,turkey_tst_2025_9,turkey,incorrect,generic,0.0,,True
483,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usamo_2025_1,usamo,incorrect,generic,0.0,,True
484,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usamo_2025_2,usamo,detected,generic,0.0,,False
485,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usamo_2025_3,usamo,incorrect,generic,0.0,,True
486,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usamo_2025_4,usamo,incorrect,generic,0.0,,True
487,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usamo_2025_5,usamo,incorrect,generic,0.0,,True
488,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usamo_2025_6,usamo,incorrect,generic,0.0,,True
489,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usatst_2025_1,usatst,detected,generic,0.0,,False
490,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usatst_2025_2,usatst,incorrect,generic,0.0,,True
491,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usatst_2025_3,usatst,incorrect,generic,0.0,,True
492,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usatst_2025_4,usatst,detected,generic,0.0,,False
493,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usatst_2025_5,usatst,incorrect,generic,0.0,,True
494,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usatst_2025_6,usatst,incorrect,generic,0.0,,True
495,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usatst_2025_7,usatst,detected,generic,0.0,,False
496,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usatst_2025_8,usatst,incorrect,generic,0.0,,True
497,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,usatst_2025_9,usatst,incorrect,generic,0.0,,True
498,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,vietnam_2025_1,vietnam,correct,generic,0.0,,False
499,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,vietnam_2025_2,vietnam,incorrect,generic,0.0,,True
500,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,vietnam_2025_3,vietnam,incorrect,generic,0.0,,True
501,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,vietnam_2025_4,vietnam,incorrect,generic,0.0,,True
502,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,vietnam_2025_5,vietnam,detected,generic,0.0,,False
503,GPT-5-mini (medium),o4-mini (high),openai/o4-mini--high,vietnam_2025_6,vietnam,detected,generic,0.0,,False
504,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_11,allrussian,correct,generic,0.0120933,,False
505,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_14,allrussian,correct,generic,0.0079215,,False
506,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_10,allrussian,incorrect,generic,0.0107748,,True
507,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_13,allrussian,detected,generic,0.0026289,,False
508,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_16,allrussian,corrected,generic,0.010269,,False
509,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_1,allrussian,incorrect,generic,0.02859405,,True
510,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_4,allrussian,incorrect,generic,0.02085675,,True
511,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_2,allrussian,incorrect,generic,0.0330571499999999,,True
512,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_12,allrussian,detected,generic,0.0294285,,False
513,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_5,allrussian,correct,generic,0.0123729,,False
514,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_3,allrussian,correct,generic,0.0340317,,False
515,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_7,allrussian,correct,generic,0.01993425,,False
516,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_15,allrussian,corrected,generic,0.0142199999999999,,False
517,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_6,allrussian,incorrect,generic,0.02463285,,True
518,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_8,allrussian,incorrect,generic,0.0326265,,True
519,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,allrussian_2025_9,allrussian,correct,generic,0.00777105,,False
520,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmo_2025_2,bmo,incorrect,generic,0.05568285,,True
521,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmo_2025_3,bmo,corrected,generic,0.026316,,False
522,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmo_2025_4,bmo,incorrect,generic,0.01830585,,True
523,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_A_2025_3,bmosl,correct,generic,0.0100596,,False
524,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_A_2025_1,bmosl,correct,generic,0.0063038999999999,,False
525,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_A_2025_2,bmosl,detected,generic,0.0149505,,False
526,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_A_2025_4,bmosl,detected,generic,0.00564765,,False
527,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_A_2025_6,bmosl,incorrect,generic,0.0246526499999999,,True
528,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_C_2025_2,bmosl,detected,generic,0.00437685,,False
529,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmo_2025_1,bmo,incorrect,generic,0.04347375,,True
530,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_C_2025_4,bmosl,incorrect,generic,0.004365,,True
531,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_C_2025_5,bmosl,incorrect,generic,0.0311116499999999,,True
532,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_C_2025_3,bmosl,incorrect,generic,0.0336966,,True
533,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_G_2025_1,bmosl,incorrect,generic,0.0282153,,True
534,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_G_2025_3,bmosl,correct,generic,0.0210349499999999,,False
535,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_C_2025_1,bmosl,incorrect,generic,0.0208888499999999,,True
536,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_A_2025_5,bmosl,correct,generic,0.03349185,,False
537,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_G_2025_4,bmosl,incorrect,generic,0.03599715,,True
538,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_G_2025_6,bmosl,incorrect,generic,0.03347025,,True
539,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_G_2025_2,bmosl,incorrect,generic,0.0142931999999999,,True
540,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_NT_2025_2,bmosl,correct,generic,0.03270675,,False
541,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_NT_2025_1,bmosl,incorrect,generic,0.0250452,,True
542,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_C_2025_6,bmosl,incorrect,generic,0.02114655,,True
543,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_G_2025_5,bmosl,corrected,generic,0.0191748,,False
544,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_NT_2025_3,bmosl,correct,generic,0.0064031999999999,,False
545,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_NT_2025_5,bmosl,detected,generic,0.0040374,,False
546,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_NT_2025_6,bmosl,correct,generic,0.0246687,,False
547,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bulgaria_2025_2,bulgaria,correct,generic,0.0118054499999999,,False
548,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_NT_2025_7,bmosl,correct,generic,0.00711555,,False
549,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bulgaria_2025_3,bulgaria,correct,generic,0.0119157,,False
550,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bulgaria_2025_1,bulgaria,corrected,generic,0.01586895,,False
551,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bulgaria_2025_5,bulgaria,detected,generic,0.0013318499999999,,False
552,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bulgaria_2025_6,bulgaria,correct,generic,0.01784835,,False
553,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bulgaria_2025_4,bulgaria,incorrect,generic,0.0476914499999999,,True
554,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,canada_2025_1,canada,incorrect,generic,0.0102087,,True
555,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,canada_2025_2,canada,correct,generic,0.00364665,,False
556,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_G_2025_7,bmosl,incorrect,generic,0.01421505,,True
557,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,canada_2025_4,canada,incorrect,generic,0.0410223,,True
558,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,china_2025_2,china,incorrect,generic,0.02707515,,True
559,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,china_2025_1,china,correct,generic,0.01932705,,False
560,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,canada_2025_5,canada,correct,generic,0.01027845,,False
561,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,bmosl_NT_2025_4,bmosl,incorrect,generic,0.0431625,,True
562,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,china_2025_6,china,correct,generic,0.02936475,,False
563,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,china_2025_3,china,correct,generic,0.0180842999999999,,False
564,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_11,chinatst,correct,generic,0.0136187999999999,,False
565,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,canada_2025_3,canada,correct,generic,0.01634295,,False
566,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,china_2025_5,china,incorrect,generic,0.0228893999999999,,True
567,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_14,chinatst,detected,generic,0.00907785,,False
568,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_12,chinatst,incorrect,generic,0.0436722,,True
569,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_10,chinatst,correct,generic,0.0242308499999999,,False
570,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_15,chinatst,detected,generic,0.00406635,,False
571,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_13,chinatst,incorrect,generic,0.03933165,,True
572,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_16,chinatst,incorrect,generic,0.0384405,,True
573,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_20,chinatst,detected,generic,0.0004633499999999,,False
574,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_2,chinatst,incorrect,generic,0.0324309,,True
575,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_18,chinatst,detected,generic,0.009057,,False
576,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_1,chinatst,incorrect,generic,0.0343941,,True
577,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_17,chinatst,detected,generic,0.02517075,,False
578,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_21,chinatst,incorrect,generic,0.02295495,,True
579,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_19,chinatst,incorrect,generic,0.0140865,,True
580,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_3,chinatst,correct,generic,0.014859,,False
581,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_22,chinatst,incorrect,generic,0.0186139499999999,,True
582,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_7,chinatst,correct,generic,0.0082944,,False
583,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_6,chinatst,correct,generic,0.0069694499999999,,False
584,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_4,chinatst,correct,generic,0.04754115,,False
585,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,egmo_2025_1,egmo,correct,generic,0.0142761,,False
586,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_8,chinatst,incorrect,generic,0.0276102,,True
587,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_9,chinatst,correct,generic,0.0247832999999999,,False
588,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,egmo_2025_2,egmo,detected,generic,0.0048482999999999,,False
589,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,egmo_2025_4,egmo,incorrect,generic,0.02799225,,True
590,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,chinatst_2025_5,chinatst,incorrect,generic,0.03215775,,True
591,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,egmo_2025_3,egmo,corrected,generic,0.0138836999999999,,False
592,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,egmo_2025_6,egmo,incorrect,generic,0.04785495,,True
593,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_A_2025_3,elmosl,correct,generic,0.0268887,,False
594,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,egmo_2025_5,egmo,incorrect,generic,0.0202235999999999,,True
595,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_A_2025_6,elmosl,correct,generic,0.00525855,,False
596,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_A_2025_5,elmosl,incorrect,generic,0.0081441,,True
597,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_A_2025_8,elmosl,correct,generic,0.01174695,,False
598,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_C_2025_1,elmosl,incorrect,generic,0.0040842,,True
599,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_C_2025_3,elmosl,correct,generic,0.0039954,,False
600,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_A_2025_7,elmosl,incorrect,generic,0.01583985,,True
601,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_A_2025_2,elmosl,incorrect,generic,0.0071376,,True
602,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_A_2025_1,elmosl,incorrect,generic,0.03524535,,True
603,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_C_2025_4,elmosl,incorrect,generic,0.00580455,,True
604,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_C_2025_7,elmosl,corrected,generic,0.0073085999999999,,False
605,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_C_2025_6,elmosl,incorrect,generic,0.0222021,,True
606,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_C_2025_8,elmosl,correct,generic,0.02915205,,False
607,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_C_2025_5,elmosl,corrected,generic,0.0212106,,False
608,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_G_2025_1,elmosl,incorrect,generic,0.0298146,,True
609,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_G_2025_2,elmosl,correct,generic,0.03375465,,False
610,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_G_2025_5,elmosl,correct,generic,0.0270088499999999,,False
611,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_G_2025_3,elmosl,incorrect,generic,0.0179121,,True
612,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_C_2025_2,elmosl,incorrect,generic,0.00306405,,True
613,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_C_2025_9,elmosl,incorrect,generic,0.0251554499999999,,True
614,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_G_2025_7,elmosl,detected,generic,0.01053315,,False
615,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_G_2025_8,elmosl,incorrect,generic,0.02119005,,True
616,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_G_2025_4,elmosl,incorrect,generic,0.0214414499999999,,True
617,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_G_2025_9,elmosl,incorrect,generic,0.03386415,,True
618,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_NT_2025_2,elmosl,incorrect,generic,0.0296951999999999,,True
619,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_NT_2025_1,elmosl,correct,generic,0.0258432,,False
620,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_NT_2025_4,elmosl,correct,generic,0.0565996499999999,,False
621,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_NT_2025_7,elmosl,incorrect,generic,0.00128355,,True
622,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_NT_2025_6,elmosl,correct,generic,0.00830355,,False
623,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,german_2025_3,german,detected,generic,0.013587,,False
624,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,german_2025_2,german,incorrect,generic,0.0585310499999999,,True
625,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_G_2025_6,elmosl,incorrect,generic,0.02199675,,True
626,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,greece_2025_1,greece,correct,generic,0.0040329,,False
627,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_NT_2025_5,elmosl,incorrect,generic,0.0086712,,True
628,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,greece_2025_3,greece,correct,generic,0.0146554499999999,,False
629,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,german_2025_4,german,correct,generic,0.02328615,,False
630,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,german_2025_1,german,corrected,generic,0.0069856499999999,,False
631,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,greece_2025_2,greece,correct,generic,0.0252625499999999,,False
632,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_1,imosl,correct,generic,0.0031032,,False
633,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,greece_2025_4,greece,correct,generic,0.0225924,,False
634,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_12,imosl,detected,generic,0.0080409,,False
635,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_15,imosl,correct,generic,0.0076705499999999,,False
636,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_11,imosl,incorrect,generic,0.0069009,,True
637,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_13,imosl,incorrect,generic,0.0146761499999999,,True
638,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_14,imosl,correct,generic,0.00780525,,False
639,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,elmosl_NT_2025_3,elmosl,incorrect,generic,0.0199970999999999,,True
640,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_17,imosl,detected,generic,0.0053828999999999,,False
641,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_18,imosl,correct,generic,0.03526575,,False
642,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_16,imosl,incorrect,generic,0.0130344,,True
643,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_2,imosl,correct,generic,0.0048860999999999,,False
644,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_10,imosl,incorrect,generic,0.0218207999999999,,True
645,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_24,imosl,incorrect,generic,0.0070861499999999,,True
646,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_19,imosl,incorrect,generic,0.0246234,,True
647,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_21,imosl,incorrect,generic,0.0513738,,True
648,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_22,imosl,incorrect,generic,0.0311058,,True
649,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_29,imosl,correct,generic,0.0144978,,False
650,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_20,imosl,correct,generic,0.0538574999999999,,False
651,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_3,imosl,correct,generic,0.0034010999999999,,False
652,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_25,imosl,correct,generic,0.0193921499999999,,False
653,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_26,imosl,correct,generic,0.03204435,,False
654,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_28,imosl,incorrect,generic,0.0240388499999999,,True
655,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_27,imosl,correct,generic,0.0178765499999999,,False
656,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_30,imosl,incorrect,generic,0.0145028999999999,,True
657,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_23,imosl,incorrect,generic,0.02727735,,True
658,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_33,imosl,detected,generic,0.00942675,,False
659,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_34,imosl,incorrect,generic,0.03311565,,True
660,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_35,imosl,incorrect,generic,0.0395260499999999,,True
661,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_5,imosl,correct,generic,0.0206872499999999,,False
662,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_6,imosl,corrected,generic,0.00311925,,False
663,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_9,imosl,correct,generic,0.0131072999999999,,False
664,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_4,imosl,incorrect,generic,0.0449141999999999,,True
665,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_31,imosl,incorrect,generic,0.01626915,,True
666,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_36,imosl,corrected,generic,0.0316330499999999,,False
667,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_2025_2,india,correct,generic,0.0031213499999999,,False
668,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_7,imosl,correct,generic,0.00705495,,False
669,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_2025_1,india,correct,generic,0.0079502999999999,,False
670,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_37,imosl,incorrect,generic,0.0190326,,True
671,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_2025_4,india,correct,generic,0.02542125,,False
672,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_8,imosl,incorrect,generic,0.0309499499999999,,True
673,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_2025_3,india,incorrect,generic,0.02600835,,True
674,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_10,india,incorrect,generic,0.0131923499999999,,True
675,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_1,india,detected,generic,0.00356175,,False
676,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_2025_6,india,incorrect,generic,0.0320097,,True
677,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_12,india,incorrect,generic,0.0219557999999999,,True
678,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_11,india,incorrect,generic,0.0200521499999999,,True
679,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,imosl_2025_32,imosl,incorrect,generic,0.01561755,,True
680,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_17,india,correct,generic,0.0317553,,False
681,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_13,india,corrected,generic,0.01977435,,False
682,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_15,india,incorrect,generic,0.0148304999999999,,True
683,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_14,india,corrected,generic,0.0106782,,False
684,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_2025_5,india,incorrect,generic,0.0266453999999999,,True
685,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_18,india,incorrect,generic,0.0416269499999999,,True
686,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_20,india,incorrect,generic,0.0184073999999999,,True
687,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_2,india,incorrect,generic,0.0336504,,True
688,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_19,india,corrected,generic,0.0300357,,False
689,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_16,india,correct,generic,0.0298694999999999,,False
690,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_3,india,incorrect,generic,0.0088861499999999,,True
691,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_21,india,incorrect,generic,0.01735185,,True
692,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_5,india,correct,generic,0.0193898999999999,,False
693,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_4,india,correct,generic,0.0308148,,False
694,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_8,india,correct,generic,0.0154478999999999,,False
695,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,iran_tst_2025_4,iran,detected,generic,0.02918985,,False
696,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_6,india,incorrect,generic,0.0205989,,True
697,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_9,india,correct,generic,0.0173405999999999,,False
698,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,iran_tst_2025_2,iran,incorrect,generic,0.0107706,,True
699,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,india_prep_2025_7,india,incorrect,generic,0.0528729,,True
700,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,iran_tst_2025_1,iran,detected,generic,0.030912,,False
701,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,iran_tst_2025_3,iran,incorrect,generic,0.02343075,,True
702,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,iran_tst_2025_9,iran,incorrect,generic,0.02817705,,True
703,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,iran_tst_2025_7,iran,detected,generic,0.000642,,False
704,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,iran_tst_2025_8,iran,incorrect,generic,0.0043624499999999,,True
705,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_1,israel,detected,generic,0.0067182,,False
706,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,iran_tst_2025_5,iran,incorrect,generic,0.02577075,,True
707,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_2,israel,incorrect,generic,0.0276919499999999,,True
708,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_6,israel,incorrect,generic,0.0536231999999999,,True
709,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_4,israel,incorrect,generic,0.0524648999999999,,True
710,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_3,israel,detected,generic,0.00435585,,False
711,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_5,israel,detected,generic,0.00892905,,False
712,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_10,israel,incorrect,generic,0.01595085,,True
713,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_8,israel,incorrect,generic,0.0192899999999999,,True
714,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,izho_2025_1,izho,incorrect,generic,0.00473025,,True
715,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_7,israel,detected,generic,0.00675255,,False
716,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,iran_tst_2025_6,iran,incorrect,generic,0.01887915,,True
717,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,izho_2025_6,izho,correct,generic,0.0220359,,False
718,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,jbmo_2025_1,jbmo,detected,generic,0.0082879499999999,,False
719,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,izho_2025_3,izho,incorrect,generic,0.0385676999999999,,True
720,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,israel_tst_2025_9,israel,incorrect,generic,0.01990845,,True
721,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,izho_2025_2,izho,incorrect,generic,0.0430161,,True
722,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,izho_2025_4,izho,incorrect,generic,0.0072387,,True
723,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,izho_2025_5,izho,incorrect,generic,0.0348873,,True
724,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,jbmo_2025_3,jbmo,incorrect,generic,0.0237103499999999,,True
725,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,korea_2025_1,korea,correct,generic,0.0301730999999999,,False
726,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,korea_2025_2,korea,correct,generic,0.0166131,,False
727,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,jbmo_2025_2,jbmo,incorrect,generic,0.0253351499999999,,True
728,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,korea_2025_4,korea,incorrect,generic,0.03344085,,True
729,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,korea_2025_5,korea,correct,generic,0.0310724999999999,,False
730,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,jbmo_2025_4,jbmo,correct,generic,0.0219496499999999,,False
731,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_1,matharena,correct,matharena,0.00662955,,False
732,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,korea_2025_3,korea,incorrect,generic,0.0301734,,True
733,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_11,matharena,correct,matharena,0.014667,,False
734,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_12,matharena,correct,matharena,0.0197147999999999,,False
735,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_10,matharena,correct,matharena,0.01272075,,False
736,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,korea_2025_6,korea,incorrect,generic,0.0304124999999999,,True
737,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_17,matharena,correct,matharena,0.0048542999999999,,False
738,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_18,matharena,correct,matharena,0.03171765,,False
739,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_16,matharena,correct,matharena,0.0055886999999999,,False
740,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_19,matharena,correct,matharena,0.00940305,,False
741,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_20,matharena,correct,matharena,0.01468485,,False
742,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_15,matharena,incorrect,matharena,0.03541845,,True
743,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_14,matharena,incorrect,matharena,0.05564265,,True
744,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_23,matharena,correct,matharena,0.01174005,,False
745,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_2,matharena,correct,matharena,0.04092795,,False
746,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_24,matharena,correct,matharena,0.0096624,,False
747,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_22,matharena,correct,matharena,0.00852075,,False
748,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_21,matharena,correct,matharena,0.02851995,,False
749,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_25,matharena,correct,matharena,0.02562195,,False
750,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_13,matharena,corrected,matharena,0.0288105,,False
751,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_28,matharena,correct,matharena,0.03172245,,False
752,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_3,matharena,correct,matharena,0.01198125,,False
753,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_29,matharena,correct,matharena,0.0281823,,False
754,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_27,matharena,correct,matharena,0.0098058,,False
755,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_4,matharena,corrected,matharena,0.00636885,,False
756,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_26,matharena,correct,matharena,0.0155763,,False
757,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_30,matharena,correct,matharena,0.0340332,,False
758,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_5,matharena,correct,matharena,0.00699195,,False
759,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_7,matharena,correct,matharena,0.01539225,,False
760,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_8,matharena,correct,matharena,0.00790635,,False
761,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_11,matharena,correct,matharena,0.0254966999999999,,False
762,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_1,matharena,correct,matharena,0.00213885,,False
763,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_12,matharena,correct,matharena,0.0099412499999999,,False
764,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_9,matharena,correct,matharena,0.0297251999999999,,False
765,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_aime_aime_2025_6,matharena,correct,matharena,0.0092682,,False
766,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_16,matharena,correct,matharena,0.00379035,,False
767,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_18,matharena,correct,matharena,0.0035832,,False
768,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_10,matharena,correct,matharena,0.01230045,,False
769,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_14,matharena,correct,matharena,0.00811635,,False
770,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_13,matharena,correct,matharena,0.03646485,,False
771,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_17,matharena,correct,matharena,0.0366537,,False
772,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_2,matharena,correct,matharena,0.0044797499999999,,False
773,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_21,matharena,correct,matharena,0.02096865,,False
774,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_23,matharena,correct,matharena,0.0039318,,False
775,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_20,matharena,correct,matharena,0.0056427,,False
776,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_26,matharena,correct,matharena,0.00656775,,False
777,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_24,matharena,correct,matharena,0.0232905,,False
778,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_19,matharena,correct,matharena,0.0089109,,False
779,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_15,matharena,correct,matharena,0.018561,,False
780,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_28,matharena,incorrect,matharena,0.0505136999999999,,True
781,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_29,matharena,correct,matharena,0.0131891999999999,,False
782,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_3,matharena,correct,matharena,0.0107796,,False
783,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_27,matharena,corrected,matharena,0.0131838,,False
784,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_25,matharena,correct,matharena,0.0116463,,False
785,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_4,matharena,correct,matharena,0.00479325,,False
786,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_5,matharena,correct,matharena,0.0133755,,False
787,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_6,matharena,correct,matharena,0.0119397,,False
788,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_22,matharena,incorrect,matharena,0.01820055,,True
789,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_30,matharena,correct,matharena,0.0301675499999999,,False
790,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_8,matharena,correct,matharena,0.00873765,,False
791,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_9,matharena,correct,matharena,0.0041966999999999,,False
792,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_1,matharena,correct,matharena,0.00425085,,False
793,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_brumo_brumo_2025_7,matharena,correct,matharena,0.00638565,,False
794,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_10,matharena,correct,matharena,0.0055806,,False
795,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_13,matharena,correct,matharena,0.01301415,,False
796,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_12,matharena,correct,matharena,0.0065072999999999,,False
797,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_14,matharena,incorrect,matharena,0.016821,,True
798,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_16,matharena,correct,matharena,0.02221845,,False
799,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_17,matharena,correct,matharena,0.0084735,,False
800,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_11,matharena,incorrect,matharena,0.01361235,,True
801,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_2,matharena,correct,matharena,0.01269765,,False
802,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_19,matharena,correct,matharena,0.0098860499999999,,False
803,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_15,matharena,detected,matharena,0.00755805,,False
804,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_20,matharena,correct,matharena,0.02297205,,False
805,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_18,matharena,incorrect,matharena,0.03911565,,True
806,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_22,matharena,correct,matharena,0.00836445,,False
807,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_21,matharena,correct,matharena,0.02660385,,False
808,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_23,matharena,detected,matharena,0.02301525,,False
809,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_27,matharena,correct,matharena,0.02469645,,False
810,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_25,matharena,detected,matharena,0.01682595,,False
811,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_3,matharena,correct,matharena,0.0074132999999999,,False
812,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_31,matharena,correct,matharena,0.0052379999999999,,False
813,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_28,matharena,correct,matharena,0.04319535,,False
814,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_30,matharena,incorrect,matharena,0.010914,,True
815,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_24,matharena,correct,matharena,0.00857025,,False
816,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_26,matharena,correct,matharena,0.02138205,,False
817,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_34,matharena,correct,matharena,0.0334596,,False
818,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_29,matharena,corrected,matharena,0.01408425,,False
819,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_33,matharena,correct,matharena,0.0143095499999999,,False
820,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_37,matharena,correct,matharena,0.02595705,,False
821,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_38,matharena,correct,matharena,0.0429856499999999,,False
822,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_39,matharena,correct,matharena,0.0147874499999999,,False
823,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_35,matharena,correct,matharena,0.0158746499999999,,False
824,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_40,matharena,correct,matharena,0.01604415,,False
825,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_4,matharena,correct,matharena,0.0091330499999999,,False
826,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_32,matharena,incorrect,matharena,0.0116382,,True
827,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_5,matharena,incorrect,matharena,0.03271515,,True
828,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_7,matharena,correct,matharena,0.0294220499999999,,False
829,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_8,matharena,correct,matharena,0.0185240999999999,,False
830,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_36,matharena,correct,matharena,0.0190571999999999,,False
831,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_6,matharena,incorrect,matharena,0.01962165,,True
832,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_11,matharena,correct,matharena,0.02059845,,False
833,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_12,matharena,correct,matharena,0.02037135,,False
834,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_1,matharena,correct,matharena,0.0038364,,False
835,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_13,matharena,correct,matharena,0.02847315,,False
836,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_10,matharena,incorrect,matharena,0.0539479499999999,,True
837,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_cmimc_cmimc_2025_9,matharena,incorrect,matharena,0.0223473,,True
838,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_16,matharena,correct,matharena,0.0237306,,False
839,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_14,matharena,correct,matharena,0.0187983,,False
840,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_15,matharena,correct,matharena,0.01879875,,False
841,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_2,matharena,corrected,matharena,0.01111425,,False
842,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_17,matharena,incorrect,matharena,0.02427345,,True
843,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_19,matharena,incorrect,matharena,0.0460975499999999,,True
844,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_22,matharena,correct,matharena,0.0115074,,False
845,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_21,matharena,correct,matharena,0.00500745,,False
846,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_24,matharena,correct,matharena,0.0101000999999999,,False
847,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_23,matharena,correct,matharena,0.0175461,,False
848,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_20,matharena,incorrect,matharena,0.06233535,,True
849,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_27,matharena,correct,matharena,0.03835665,,False
850,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_25,matharena,incorrect,matharena,0.0334954499999999,,True
851,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_28,matharena,correct,matharena,0.01305165,,False
852,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_26,matharena,correct,matharena,0.01727835,,False
853,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_29,matharena,detected,matharena,0.0121010999999999,,False
854,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_4,matharena,correct,matharena,0.0079505999999999,,False
855,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_5,matharena,correct,matharena,0.0133965,,False
856,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_3,matharena,incorrect,matharena,0.0150303,,True
857,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_6,matharena,incorrect,matharena,0.0132770999999999,,True
858,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_30,matharena,incorrect,matharena,0.0496651499999999,,True
859,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_8,matharena,detected,matharena,0.0018592499999999,,False
860,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_7,matharena,incorrect,matharena,0.0100552499999999,,True
861,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_1,matharena,correct,matharena,0.0052328999999999,,False
862,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_9,matharena,incorrect,matharena,0.0269285999999999,,True
863,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_10,matharena,correct,matharena,0.0072957,,False
864,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_11,matharena,detected,matharena,0.0341667,,False
865,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_13,matharena,correct,matharena,0.0145594499999999,,False
866,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_14,matharena,correct,matharena,0.0089025,,False
867,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_12,matharena,correct,matharena,0.0034776,,False
868,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_hmmt_hmmt_feb_2025_18,matharena,corrected,matharena,0.02237595,,False
869,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_15,matharena,correct,matharena,0.00234765,,False
870,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_16,matharena,correct,matharena,0.0048423,,False
871,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_17,matharena,correct,matharena,0.0102917999999999,,False
872,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_2,matharena,correct,matharena,0.0033258,,False
873,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_18,matharena,incorrect,matharena,0.03062955,,True
874,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_20,matharena,incorrect,matharena,0.02101905,,True
875,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_21,matharena,correct,matharena,0.01818525,,False
876,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_25,matharena,correct,matharena,0.0086798999999999,,False
877,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_23,matharena,incorrect,matharena,0.0045691499999999,,True
878,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_26,matharena,correct,matharena,0.00774795,,False
879,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_24,matharena,detected,matharena,0.02532405,,False
880,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_27,matharena,incorrect,matharena,0.04873245,,True
881,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_22,matharena,incorrect,matharena,0.02351415,,True
882,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_19,matharena,correct,matharena,0.0263586,,False
883,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_3,matharena,correct,matharena,0.0050288999999999,,False
884,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_29,matharena,correct,matharena,0.02912925,,False
885,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_32,matharena,correct,matharena,0.0084193499999999,,False
886,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_30,matharena,correct,matharena,0.02462685,,False
887,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_36,matharena,correct,matharena,0.0112103999999999,,False
888,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_33,matharena,correct,matharena,0.0280215,,False
889,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_28,matharena,correct,matharena,0.0077931,,False
890,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_35,matharena,correct,matharena,0.0103131,,False
891,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_31,matharena,corrected,matharena,0.00464385,,False
892,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_37,matharena,correct,matharena,0.0178233,,False
893,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_38,matharena,incorrect,matharena,0.0185453999999999,,True
894,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_4,matharena,correct,matharena,0.00537225,,False
895,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_39,matharena,correct,matharena,0.01635585,,False
896,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_34,matharena,incorrect,matharena,0.0207252,,True
897,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_42,matharena,incorrect,matharena,0.0270963,,True
898,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_44,matharena,correct,matharena,0.0036516,,False
899,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_40,matharena,correct,matharena,0.0121012499999999,,False
900,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_45,matharena,correct,matharena,0.0030545999999999,,False
901,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_41,matharena,correct,matharena,0.0322401,,False
902,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_47,matharena,correct,matharena,0.00815625,,False
903,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_46,matharena,incorrect,matharena,0.0063894,,True
904,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_49,matharena,correct,matharena,0.00814395,,False
905,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_48,matharena,correct,matharena,0.0093859499999999,,False
906,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_5,matharena,correct,matharena,0.0045018,,False
907,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_43,matharena,incorrect,matharena,0.04857825,,True
908,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_50,matharena,correct,matharena,0.00449385,,False
909,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_51,matharena,correct,matharena,0.01818645,,False
910,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_52,matharena,incorrect,matharena,0.0143199,,True
911,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_7,matharena,correct,matharena,0.0083877,,False
912,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_53,matharena,correct,matharena,0.02655315,,False
913,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,nordic_2025_3,nordic,correct,generic,0.03562935,,False
914,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,nordic_2025_1,nordic,correct,generic,0.0065607,,False
915,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_8,matharena,corrected,matharena,0.00614145,,False
916,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,nordic_2025_2,nordic,correct,generic,0.01370745,,False
917,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_9,matharena,correct,matharena,0.00543615,,False
918,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,matharena_smt_smt_2025_6,matharena,detected,matharena,0.00497445,,False
919,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,pan_african_2025_1,pan,incorrect,generic,0.01711815,,True
920,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,pan_african_2025_4,pan,correct,generic,0.0032129999999999,,False
921,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,pan_african_2025_3,pan,incorrect,generic,0.0031062,,True
922,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,pan_african_2025_2,pan,correct,generic,0.00440235,,False
923,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,philippines_2025_5,philippines,correct,generic,0.0150596999999999,,False
924,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,philippines_2025_3,philippines,correct,generic,0.0081658499999999,,False
925,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,philippines_2025_2,philippines,corrected,generic,0.0113067,,False
926,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,philippines_2025_4,philippines,correct,generic,0.03772185,,False
927,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,pan_african_2025_5,pan,correct,generic,0.02696235,,False
928,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,polish_2025_1,polish,correct,generic,0.00762255,,False
929,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,philippines_2025_1,philippines,incorrect,generic,0.0414790499999999,,True
930,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,philippines_2025_6,philippines,incorrect,generic,0.0209920499999999,,True
931,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,philippines_2025_7,philippines,correct,generic,0.03625335,,False
932,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,philippines_2025_8,philippines,correct,generic,0.0360447,,False
933,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,polish_2025_2,polish,correct,generic,0.0045782999999999,,False
934,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,polish_2025_3,polish,correct,generic,0.0099633,,False
935,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,polish_2025_6,polish,detected,generic,0.00613395,,False
936,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,polish_2025_5,polish,correct,generic,0.0352996499999999,,False
937,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,rmm_2025_2,rmm,correct,generic,0.0158046,,False
938,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,pan_african_2025_6,pan,incorrect,generic,0.00827385,,True
939,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,rmm_2025_3,rmm,incorrect,generic,0.00754425,,True
940,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,rmm_2025_5,rmm,incorrect,generic,0.0308735999999999,,True
941,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_10_2025_3,romania,correct,generic,0.01231155,,False
942,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,polish_2025_4,polish,detected,generic,0.0150476999999999,,False
943,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,rmm_2025_4,rmm,incorrect,generic,0.0261834,,True
944,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,rmm_2025_6,rmm,correct,generic,0.01792965,,False
945,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,rmm_2025_1,rmm,detected,generic,0.0107765999999999,,False
946,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_11_2025_2,romania,detected,generic,0.00110505,,False
947,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_10_2025_1,romania,correct,generic,0.0334922999999999,,False
948,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_10_2025_2,romania,corrected,generic,0.03379185,,False
949,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_11_2025_3,romania,correct,generic,0.0169769999999999,,False
950,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_12_2025_2,romania,correct,generic,0.00467235,,False
951,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_9_2025_1,romania,incorrect,generic,0.02459685,,True
952,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_11_2025_1,romania,correct,generic,0.01424985,,False
953,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_12_2025_1,romania,correct,generic,0.0166599,,False
954,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_9_2025_2,romania,correct,generic,0.0223989,,False
955,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_12_2025_3,romania,correct,generic,0.0194704499999999,,False
956,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_tst_2025_1,romania,corrected,generic,0.01348455,,False
957,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,serbia_tst_bmo_2025_1,serbia,correct,generic,0.01411035,,False
958,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_tst_2025_3,romania,incorrect,generic,0.0134634,,True
959,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,spain_2025_1,spain,correct,generic,0.0103608,,False
960,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,serbia_tst_bmo_2025_3,serbia,correct,generic,0.02271525,,False
961,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,romania_tst_2025_2,romania,incorrect,generic,0.00634515,,True
962,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,spain_2025_4,spain,incorrect,generic,0.0323787,,True
963,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,spain_2025_3,spain,detected,generic,0.0004131,,False
964,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,serbia_tst_bmo_2025_2,serbia,correct,generic,0.0282346499999999,,False
965,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_1,thai,correct,generic,0.00400035,,False
966,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,spain_2025_2,spain,correct,generic,0.0191838,,False
967,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_2,thai,incorrect,generic,0.0101170499999999,,True
968,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_10,thai,correct,generic,0.02133705,,False
969,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,serbia_tst_bmo_2025_4,serbia,corrected,generic,0.0321882,,False
970,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,spain_2025_5,spain,correct,generic,0.02224485,,False
971,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_3,thai,incorrect,generic,0.0205317,,True
972,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_5,thai,correct,generic,0.02040705,,False
973,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_8,thai,correct,generic,0.005535,,False
974,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_6,thai,correct,generic,0.0128713499999999,,False
975,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_9,thai,correct,generic,0.0107346,,False
976,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_4,thai,correct,generic,0.009597,,False
977,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,thai_2025_7,thai,correct,generic,0.0165123,,False
978,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,turkey_tst_2025_3,turkey,incorrect,generic,0.02872395,,True
979,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,turkey_tst_2025_4,turkey,correct,generic,0.01471275,,False
980,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,turkey_tst_2025_6,turkey,incorrect,generic,0.04382805,,True
981,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,turkey_tst_2025_2,turkey,incorrect,generic,0.0380794499999999,,True
982,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,turkey_tst_2025_9,turkey,detected,generic,0.0133996499999999,,False
983,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usamo_2025_1,usamo,correct,generic,0.00406335,,False
984,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,turkey_tst_2025_8,turkey,correct,generic,0.0099221999999999,,False
985,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usamo_2025_2,usamo,correct,generic,0.0043263,,False
986,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,turkey_tst_2025_7,turkey,incorrect,generic,0.0388429499999999,,True
987,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usamo_2025_4,usamo,correct,generic,0.0147082499999999,,False
988,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,turkey_tst_2025_1,turkey,detected,generic,0.0038260499999999,,False
989,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usamo_2025_6,usamo,detected,generic,0.0116456999999999,,False
990,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usatst_2025_1,usatst,detected,generic,0.0074026499999999,,False
991,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usamo_2025_5,usamo,detected,generic,0.02771835,,False
992,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usatst_2025_3,usatst,incorrect,generic,0.02784105,,True
993,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usatst_2025_4,usatst,correct,generic,0.0097515,,False
994,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usatst_2025_7,usatst,correct,generic,0.01515465,,False
995,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usatst_2025_6,usatst,detected,generic,0.0060471,,False
996,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,turkey_tst_2025_5,turkey,incorrect,generic,0.0084384,,True
997,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usatst_2025_2,usatst,incorrect,generic,0.01511265,,True
998,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usatst_2025_5,usatst,incorrect,generic,0.0305990999999999,,True
999,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usatst_2025_8,usatst,detected,generic,0.02015565,,False
1000,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,vietnam_2025_4,vietnam,incorrect,generic,0.0331720499999999,,True
1001,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usamo_2025_3,usamo,incorrect,generic,0.0129663,,True
1002,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,vietnam_2025_2,vietnam,detected,generic,0.01173405,,False
1003,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,vietnam_2025_3,vietnam,corrected,generic,0.0267532499999999,,False
1004,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,vietnam_2025_6,vietnam,correct,generic,0.01245285,,False
1005,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,vietnam_2025_1,vietnam,correct,generic,0.0071553,,False
1006,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,vietnam_2025_5,vietnam,correct,generic,0.02650395,,False
1007,GPT-5-mini (medium),GPT OSS 120B (high),openai/oss-120b,usatst_2025_9,usatst,incorrect,generic,0.03413625,,True
1008,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_12,allrussian,incorrect,generic,0.0,,True
1009,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_11,allrussian,incorrect,generic,0.0,,True
1010,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_13,allrussian,incorrect,generic,0.0,,True
1011,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_15,allrussian,incorrect,generic,0.0,,True
1012,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_14,allrussian,detected,generic,0.0,,False
1013,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_1,allrussian,incorrect,generic,0.0,,True
1014,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_10,allrussian,incorrect,generic,0.0,,True
1015,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_16,allrussian,incorrect,generic,0.0,,True
1016,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_3,allrussian,correct,generic,0.0,,False
1017,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_4,allrussian,incorrect,generic,0.0,,True
1018,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_6,allrussian,incorrect,generic,0.0,,True
1019,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_2,allrussian,incorrect,generic,0.0,,True
1020,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_7,allrussian,incorrect,generic,0.0,,True
1021,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_8,allrussian,incorrect,generic,0.0,,True
1022,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_5,allrussian,incorrect,generic,0.0,,True
1023,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmo_2025_2,bmo,incorrect,generic,0.0,,True
1024,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,allrussian_2025_9,allrussian,incorrect,generic,0.0,,True
1025,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmo_2025_3,bmo,incorrect,generic,0.0,,True
1026,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmo_2025_4,bmo,incorrect,generic,0.0,,True
1027,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_A_2025_1,bmosl,incorrect,generic,0.0,,True
1028,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_A_2025_4,bmosl,incorrect,generic,0.0,,True
1029,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_A_2025_5,bmosl,incorrect,generic,0.0,,True
1030,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmo_2025_1,bmo,incorrect,generic,0.0,,True
1031,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_C_2025_1,bmosl,incorrect,generic,0.0,,True
1032,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_A_2025_6,bmosl,correct,generic,0.0,,False
1033,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_C_2025_2,bmosl,incorrect,generic,0.0,,True
1034,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_C_2025_5,bmosl,incorrect,generic,0.0,,True
1035,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_C_2025_4,bmosl,incorrect,generic,0.0,,True
1036,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_A_2025_2,bmosl,incorrect,generic,0.0,,True
1037,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_A_2025_3,bmosl,incorrect,generic,0.0,,True
1038,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_C_2025_3,bmosl,incorrect,generic,0.0,,True
1039,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_G_2025_3,bmosl,detected,generic,0.0,,False
1040,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_G_2025_4,bmosl,incorrect,generic,0.0,,True
1041,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_G_2025_6,bmosl,incorrect,generic,0.0,,True
1042,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_C_2025_6,bmosl,incorrect,generic,0.0,,True
1043,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_G_2025_1,bmosl,incorrect,generic,0.0,,True
1044,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_G_2025_2,bmosl,incorrect,generic,0.0,,True
1045,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_G_2025_7,bmosl,correct,generic,0.0,,False
1046,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_NT_2025_1,bmosl,detected,generic,0.0,,False
1047,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_NT_2025_2,bmosl,correct,generic,0.0,,False
1048,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_NT_2025_4,bmosl,correct,generic,0.0,,False
1049,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_NT_2025_5,bmosl,incorrect,generic,0.0,,True
1050,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_NT_2025_6,bmosl,incorrect,generic,0.0,,True
1051,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_NT_2025_3,bmosl,incorrect,generic,0.0,,True
1052,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bulgaria_2025_1,bulgaria,corrected,generic,0.0,,False
1053,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bulgaria_2025_2,bulgaria,detected,generic,0.0,,False
1054,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_NT_2025_7,bmosl,detected,generic,0.0,,False
1055,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,canada_2025_3,canada,incorrect,generic,0.0,,True
1056,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bulgaria_2025_6,bulgaria,incorrect,generic,0.0,,True
1057,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bulgaria_2025_3,bulgaria,incorrect,generic,0.0,,True
1058,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bulgaria_2025_5,bulgaria,incorrect,generic,0.0,,True
1059,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bulgaria_2025_4,bulgaria,incorrect,generic,0.0,,True
1060,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,bmosl_G_2025_5,bmosl,corrected,generic,0.0,,False
1061,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,canada_2025_2,canada,incorrect,generic,0.0,,True
1062,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,canada_2025_4,canada,incorrect,generic,0.0,,True
1063,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,canada_2025_5,canada,incorrect,generic,0.0,,True
1064,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,china_2025_2,china,incorrect,generic,0.0,,True
1065,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,china_2025_5,china,incorrect,generic,0.0,,True
1066,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,china_2025_1,china,incorrect,generic,0.0,,True
1067,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_11,chinatst,incorrect,generic,0.0,,True
1068,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_12,chinatst,incorrect,generic,0.0,,True
1069,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,china_2025_3,china,incorrect,generic,0.0,,True
1070,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_14,chinatst,incorrect,generic,0.0,,True
1071,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,china_2025_6,china,incorrect,generic,0.0,,True
1072,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_13,chinatst,incorrect,generic,0.0,,True
1073,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_15,chinatst,incorrect,generic,0.0,,True
1074,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_1,chinatst,incorrect,generic,0.0,,True
1075,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_16,chinatst,incorrect,generic,0.0,,True
1076,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_10,chinatst,incorrect,generic,0.0,,True
1077,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_18,chinatst,incorrect,generic,0.0,,True
1078,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_19,chinatst,incorrect,generic,0.0,,True
1079,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,canada_2025_1,canada,incorrect,generic,0.0,,True
1080,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_17,chinatst,incorrect,generic,0.0,,True
1081,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_21,chinatst,incorrect,generic,0.0,,True
1082,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_4,chinatst,incorrect,generic,0.0,,True
1083,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_5,chinatst,incorrect,generic,0.0,,True
1084,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_20,chinatst,incorrect,generic,0.0,,True
1085,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_3,chinatst,incorrect,generic,0.0,,True
1086,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_7,chinatst,incorrect,generic,0.0,,True
1087,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_2,chinatst,incorrect,generic,0.0,,True
1088,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_8,chinatst,corrected,generic,0.0,,False
1089,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_6,chinatst,incorrect,generic,0.0,,True
1090,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,egmo_2025_2,egmo,incorrect,generic,0.0,,True
1091,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,egmo_2025_1,egmo,detected,generic,0.0,,False
1092,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_22,chinatst,incorrect,generic,0.0,,True
1093,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,egmo_2025_3,egmo,incorrect,generic,0.0,,True
1094,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_A_2025_1,elmosl,incorrect,generic,0.0,,True
1095,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,egmo_2025_6,egmo,incorrect,generic,0.0,,True
1096,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_A_2025_2,elmosl,corrected,generic,0.0,,False
1097,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,egmo_2025_4,egmo,corrected,generic,0.0,,False
1098,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,egmo_2025_5,egmo,incorrect,generic,0.0,,True
1099,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_A_2025_5,elmosl,incorrect,generic,0.0,,True
1100,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,chinatst_2025_9,chinatst,detected,generic,0.0,,False
1101,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_A_2025_3,elmosl,incorrect,generic,0.0,,True
1102,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_C_2025_2,elmosl,incorrect,generic,0.0,,True
1103,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_A_2025_6,elmosl,incorrect,generic,0.0,,True
1104,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_A_2025_8,elmosl,incorrect,generic,0.0,,True
1105,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_C_2025_1,elmosl,incorrect,generic,0.0,,True
1106,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_C_2025_5,elmosl,incorrect,generic,0.0,,True
1107,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_C_2025_3,elmosl,correct,generic,0.0,,False
1108,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_A_2025_7,elmosl,incorrect,generic,0.0,,True
1109,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_C_2025_7,elmosl,incorrect,generic,0.0,,True
1110,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_G_2025_1,elmosl,incorrect,generic,0.0,,True
1111,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_C_2025_9,elmosl,incorrect,generic,0.0,,True
1112,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_G_2025_2,elmosl,incorrect,generic,0.0,,True
1113,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_C_2025_6,elmosl,incorrect,generic,0.0,,True
1114,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_G_2025_6,elmosl,incorrect,generic,0.0,,True
1115,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_G_2025_3,elmosl,detected,generic,0.0,,False
1116,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_C_2025_8,elmosl,incorrect,generic,0.0,,True
1117,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_G_2025_4,elmosl,incorrect,generic,0.0,,True
1118,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_NT_2025_4,elmosl,incorrect,generic,0.0,,True
1119,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_C_2025_4,elmosl,incorrect,generic,0.0,,True
1120,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_G_2025_9,elmosl,incorrect,generic,0.0,,True
1121,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_G_2025_8,elmosl,incorrect,generic,0.0,,True
1122,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_NT_2025_2,elmosl,incorrect,generic,0.0,,True
1123,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_NT_2025_1,elmosl,incorrect,generic,0.0,,True
1124,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_NT_2025_7,elmosl,incorrect,generic,0.0,,True
1125,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_G_2025_5,elmosl,incorrect,generic,0.0,,True
1126,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_NT_2025_6,elmosl,incorrect,generic,0.0,,True
1127,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,german_2025_1,german,correct,generic,0.0,,False
1128,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_NT_2025_3,elmosl,incorrect,generic,0.0,,True
1129,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,german_2025_3,german,detected,generic,0.0,,False
1130,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_NT_2025_5,elmosl,incorrect,generic,0.0,,True
1131,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,german_2025_2,german,incorrect,generic,0.0,,True
1132,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,greece_2025_2,greece,incorrect,generic,0.0,,True
1133,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_10,imosl,incorrect,generic,0.0,,True
1134,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,greece_2025_3,greece,detected,generic,0.0,,False
1135,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_11,imosl,incorrect,generic,0.0,,True
1136,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,german_2025_4,german,correct,generic,0.0,,False
1137,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,greece_2025_4,greece,detected,generic,0.0,,False
1138,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_13,imosl,correct,generic,0.0,,False
1139,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_1,imosl,detected,generic,0.0,,False
1140,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_15,imosl,incorrect,generic,0.0,,True
1141,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,greece_2025_1,greece,detected,generic,0.0,,False
1142,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_12,imosl,incorrect,generic,0.0,,True
1143,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_16,imosl,incorrect,generic,0.0,,True
1144,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_17,imosl,incorrect,generic,0.0,,True
1145,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_18,imosl,incorrect,generic,0.0,,True
1146,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_14,imosl,incorrect,generic,0.0,,True
1147,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_2,imosl,correct,generic,0.0,,False
1148,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,elmosl_G_2025_7,elmosl,incorrect,generic,0.0,,True
1149,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_19,imosl,incorrect,generic,0.0,,True
1150,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_22,imosl,incorrect,generic,0.0,,True
1151,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_20,imosl,incorrect,generic,0.0,,True
1152,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_24,imosl,incorrect,generic,0.0,,True
1153,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_25,imosl,incorrect,generic,0.0,,True
1154,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_26,imosl,incorrect,generic,0.0,,True
1155,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_21,imosl,incorrect,generic,0.0,,True
1156,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_27,imosl,corrected,generic,0.0,,False
1157,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_3,imosl,incorrect,generic,0.0,,True
1158,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_28,imosl,correct,generic,0.0,,False
1159,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_23,imosl,incorrect,generic,0.0,,True
1160,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_30,imosl,incorrect,generic,0.0,,True
1161,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_29,imosl,incorrect,generic,0.0,,True
1162,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_33,imosl,detected,generic,0.0,,False
1163,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_34,imosl,incorrect,generic,0.0,,True
1164,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_31,imosl,incorrect,generic,0.0,,True
1165,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_32,imosl,incorrect,generic,0.0,,True
1166,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_35,imosl,incorrect,generic,0.0,,True
1167,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_6,imosl,incorrect,generic,0.0,,True
1168,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_37,imosl,incorrect,generic,0.0,,True
1169,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_36,imosl,incorrect,generic,0.0,,True
1170,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_5,imosl,incorrect,generic,0.0,,True
1171,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_2025_1,india,detected,generic,0.0,,False
1172,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_7,imosl,incorrect,generic,0.0,,True
1173,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_2025_3,india,incorrect,generic,0.0,,True
1174,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_4,imosl,incorrect,generic,0.0,,True
1175,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_2025_2,india,incorrect,generic,0.0,,True
1176,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_9,imosl,detected,generic,0.0,,False
1177,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_2025_4,india,detected,generic,0.0,,False
1178,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,imosl_2025_8,imosl,incorrect,generic,0.0,,True
1179,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_2025_5,india,incorrect,generic,0.0,,True
1180,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_12,india,incorrect,generic,0.0,,True
1181,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_11,india,incorrect,generic,0.0,,True
1182,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_2025_6,india,incorrect,generic,0.0,,True
1183,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_13,india,incorrect,generic,0.0,,True
1184,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_10,india,correct,generic,0.0,,False
1185,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_16,india,incorrect,generic,0.0,,True
1186,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_15,india,incorrect,generic,0.0,,True
1187,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_14,india,incorrect,generic,0.0,,True
1188,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_18,india,incorrect,generic,0.0,,True
1189,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_20,india,incorrect,generic,0.0,,True
1190,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_3,india,incorrect,generic,0.0,,True
1191,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_21,india,incorrect,generic,0.0,,True
1192,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_4,india,incorrect,generic,0.0,,True
1193,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_17,india,incorrect,generic,0.0,,True
1194,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_2,india,incorrect,generic,0.0,,True
1195,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_6,india,incorrect,generic,0.0,,True
1196,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_5,india,incorrect,generic,0.0,,True
1197,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_7,india,incorrect,generic,0.0,,True
1198,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_8,india,incorrect,generic,0.0,,True
1199,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_1,india,detected,generic,0.0,,False
1200,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_19,india,incorrect,generic,0.0,,True
1201,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,iran_tst_2025_2,iran,incorrect,generic,0.0,,True
1202,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,india_prep_2025_9,india,correct,generic,0.0,,False
1203,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,iran_tst_2025_1,iran,incorrect,generic,0.0,,True
1204,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,iran_tst_2025_3,iran,incorrect,generic,0.0,,True
1205,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,iran_tst_2025_6,iran,incorrect,generic,0.0,,True
1206,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,iran_tst_2025_4,iran,incorrect,generic,0.0,,True
1207,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,iran_tst_2025_9,iran,incorrect,generic,0.0,,True
1208,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,iran_tst_2025_5,iran,incorrect,generic,0.0,,True
1209,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_1,israel,incorrect,generic,0.0,,True
1210,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,iran_tst_2025_8,iran,incorrect,generic,0.0,,True
1211,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,iran_tst_2025_7,iran,incorrect,generic,0.0,,True
1212,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_2,israel,incorrect,generic,0.0,,True
1213,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_6,israel,incorrect,generic,0.0,,True
1214,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_3,israel,detected,generic,0.0,,False
1215,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_4,israel,incorrect,generic,0.0,,True
1216,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_5,israel,incorrect,generic,0.0,,True
1217,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_8,israel,incorrect,generic,0.0,,True
1218,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_7,israel,incorrect,generic,0.0,,True
1219,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_10,israel,incorrect,generic,0.0,,True
1220,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,izho_2025_2,izho,incorrect,generic,0.0,,True
1221,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,izho_2025_3,izho,incorrect,generic,0.0,,True
1222,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,israel_tst_2025_9,israel,incorrect,generic,0.0,,True
1223,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,izho_2025_4,izho,incorrect,generic,0.0,,True
1224,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,jbmo_2025_1,jbmo,detected,generic,0.0,,False
1225,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,izho_2025_5,izho,incorrect,generic,0.0,,True
1226,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,jbmo_2025_4,jbmo,corrected,generic,0.0,,False
1227,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,jbmo_2025_2,jbmo,incorrect,generic,0.0,,True
1228,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,izho_2025_1,izho,corrected,generic,0.0,,False
1229,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,korea_2025_2,korea,incorrect,generic,0.0,,True
1230,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_10,matharena,incorrect,matharena,0.0,,True
1231,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_1,matharena,incorrect,matharena,0.0,,True
1232,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,izho_2025_6,izho,detected,generic,0.0,,False
1233,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,korea_2025_3,korea,incorrect,generic,0.0,,True
1234,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,jbmo_2025_3,jbmo,incorrect,generic,0.0,,True
1235,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,korea_2025_6,korea,incorrect,generic,0.0,,True
1236,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_12,matharena,detected,matharena,0.0,,False
1237,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_13,matharena,detected,matharena,0.0,,False
1238,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_16,matharena,correct,matharena,0.0,,False
1239,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,korea_2025_5,korea,incorrect,generic,0.0,,True
1240,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,korea_2025_1,korea,incorrect,generic,0.0,,True
1241,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_17,matharena,correct,matharena,0.0,,False
1242,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_11,matharena,incorrect,matharena,0.0,,True
1243,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_14,matharena,incorrect,matharena,0.0,,True
1244,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_19,matharena,corrected,matharena,0.0,,False
1245,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,korea_2025_4,korea,incorrect,generic,0.0,,True
1246,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_18,matharena,incorrect,matharena,0.0,,True
1247,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_15,matharena,incorrect,matharena,0.0,,True
1248,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_21,matharena,detected,matharena,0.0,,False
1249,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_20,matharena,incorrect,matharena,0.0,,True
1250,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_22,matharena,detected,matharena,0.0,,False
1251,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_25,matharena,detected,matharena,0.0,,False
1252,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_24,matharena,incorrect,matharena,0.0,,True
1253,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_23,matharena,correct,matharena,0.0,,False
1254,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_28,matharena,corrected,matharena,0.0,,False
1255,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_29,matharena,correct,matharena,0.0,,False
1256,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_26,matharena,incorrect,matharena,0.0,,True
1257,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_2,matharena,correct,matharena,0.0,,False
1258,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_30,matharena,incorrect,matharena,0.0,,True
1259,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_5,matharena,correct,matharena,0.0,,False
1260,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_27,matharena,correct,matharena,0.0,,False
1261,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_4,matharena,incorrect,matharena,0.0,,True
1262,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_6,matharena,correct,matharena,0.0,,False
1263,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_1,matharena,correct,matharena,0.0,,False
1264,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_7,matharena,incorrect,matharena,0.0,,True
1265,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_11,matharena,correct,matharena,0.0,,False
1266,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_3,matharena,detected,matharena,0.0,,False
1267,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_8,matharena,corrected,matharena,0.0,,False
1268,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_10,matharena,incorrect,matharena,0.0,,True
1269,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_12,matharena,incorrect,matharena,0.0,,True
1270,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_16,matharena,correct,matharena,0.0,,False
1271,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_15,matharena,incorrect,matharena,0.0,,True
1272,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_19,matharena,incorrect,matharena,0.0,,True
1273,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_17,matharena,detected,matharena,0.0,,False
1274,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_18,matharena,detected,matharena,0.0,,False
1275,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_13,matharena,incorrect,matharena,0.0,,True
1276,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_aime_aime_2025_9,matharena,incorrect,matharena,0.0,,True
1277,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_2,matharena,detected,matharena,0.0,,False
1278,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_21,matharena,incorrect,matharena,0.0,,True
1279,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_24,matharena,correct,matharena,0.0,,False
1280,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_14,matharena,correct,matharena,0.0,,False
1281,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_25,matharena,incorrect,matharena,0.0,,True
1282,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_23,matharena,incorrect,matharena,0.0,,True
1283,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_28,matharena,incorrect,matharena,0.0,,True
1284,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_26,matharena,correct,matharena,0.0,,False
1285,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_27,matharena,incorrect,matharena,0.0,,True
1286,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_29,matharena,incorrect,matharena,0.0,,True
1287,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_5,matharena,detected,matharena,0.0,,False
1288,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_20,matharena,incorrect,matharena,0.0,,True
1289,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_22,matharena,incorrect,matharena,0.0,,True
1290,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_30,matharena,incorrect,matharena,0.0,,True
1291,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_7,matharena,correct,matharena,0.0,,False
1292,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_3,matharena,incorrect,matharena,0.0,,True
1293,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_4,matharena,correct,matharena,0.0,,False
1294,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_9,matharena,correct,matharena,0.0,,False
1295,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_1,matharena,correct,matharena,0.0,,False
1296,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_6,matharena,detected,matharena,0.0,,False
1297,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_12,matharena,detected,matharena,0.0,,False
1298,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_11,matharena,incorrect,matharena,0.0,,True
1299,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_10,matharena,detected,matharena,0.0,,False
1300,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_13,matharena,correct,matharena,0.0,,False
1301,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_brumo_brumo_2025_8,matharena,correct,matharena,0.0,,False
1302,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_17,matharena,detected,matharena,0.0,,False
1303,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_18,matharena,incorrect,matharena,0.0,,True
1304,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_15,matharena,incorrect,matharena,0.0,,True
1305,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_16,matharena,incorrect,matharena,0.0,,True
1306,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_2,matharena,correct,matharena,0.0,,False
1307,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_19,matharena,detected,matharena,0.0,,False
1308,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_14,matharena,incorrect,matharena,0.0,,True
1309,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_20,matharena,incorrect,matharena,0.0,,True
1310,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_26,matharena,corrected,matharena,0.0,,False
1311,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_25,matharena,incorrect,matharena,0.0,,True
1312,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_24,matharena,correct,matharena,0.0,,False
1313,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_21,matharena,incorrect,matharena,0.0,,True
1314,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_22,matharena,incorrect,matharena,0.0,,True
1315,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_31,matharena,correct,matharena,0.0,,False
1316,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_27,matharena,incorrect,matharena,0.0,,True
1317,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_30,matharena,incorrect,matharena,0.0,,True
1318,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_32,matharena,incorrect,matharena,0.0,,True
1319,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_28,matharena,incorrect,matharena,0.0,,True
1320,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_29,matharena,incorrect,matharena,0.0,,True
1321,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_34,matharena,incorrect,matharena,0.0,,True
1322,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_37,matharena,incorrect,matharena,0.0,,True
1323,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_3,matharena,correct,matharena,0.0,,False
1324,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_38,matharena,incorrect,matharena,0.0,,True
1325,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_23,matharena,detected,matharena,0.0,,False
1326,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_36,matharena,incorrect,matharena,0.0,,True
1327,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_33,matharena,incorrect,matharena,0.0,,True
1328,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_5,matharena,detected,matharena,0.0,,False
1329,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_39,matharena,correct,matharena,0.0,,False
1330,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_40,matharena,incorrect,matharena,0.0,,True
1331,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_35,matharena,incorrect,matharena,0.0,,True
1332,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_8,matharena,incorrect,matharena,0.0,,True
1333,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_9,matharena,incorrect,matharena,0.0,,True
1334,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_10,matharena,incorrect,matharena,0.0,,True
1335,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_1,matharena,correct,matharena,0.0,,False
1336,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_6,matharena,incorrect,matharena,0.0,,True
1337,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_13,matharena,incorrect,matharena,0.0,,True
1338,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_14,matharena,incorrect,matharena,0.0,,True
1339,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_4,matharena,incorrect,matharena,0.0,,True
1340,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_12,matharena,incorrect,matharena,0.0,,True
1341,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_15,matharena,incorrect,matharena,0.0,,True
1342,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_18,matharena,incorrect,matharena,0.0,,True
1343,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_17,matharena,incorrect,matharena,0.0,,True
1344,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_2,matharena,correct,matharena,0.0,,False
1345,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_19,matharena,incorrect,matharena,0.0,,True
1346,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_cmimc_cmimc_2025_7,matharena,incorrect,matharena,0.0,,True
1347,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_11,matharena,incorrect,matharena,0.0,,True
1348,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_22,matharena,incorrect,matharena,0.0,,True
1349,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_23,matharena,correct,matharena,0.0,,False
1350,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_16,matharena,incorrect,matharena,0.0,,True
1351,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_21,matharena,correct,matharena,0.0,,False
1352,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_24,matharena,incorrect,matharena,0.0,,True
1353,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_20,matharena,incorrect,matharena,0.0,,True
1354,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_25,matharena,incorrect,matharena,0.0,,True
1355,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_26,matharena,detected,matharena,0.0,,False
1356,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_27,matharena,correct,matharena,0.0,,False
1357,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_28,matharena,correct,matharena,0.0,,False
1358,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_3,matharena,incorrect,matharena,0.0,,True
1359,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_30,matharena,corrected,matharena,0.0,,False
1360,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_5,matharena,incorrect,matharena,0.0,,True
1361,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_6,matharena,detected,matharena,0.0,,False
1362,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_7,matharena,detected,matharena,0.0,,False
1363,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_4,matharena,incorrect,matharena,0.0,,True
1364,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_1,matharena,correct,matharena,0.0,,False
1365,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_8,matharena,detected,matharena,0.0,,False
1366,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_11,matharena,incorrect,matharena,0.0,,True
1367,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_9,matharena,incorrect,matharena,0.0,,True
1368,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_hmmt_hmmt_feb_2025_29,matharena,incorrect,matharena,0.0,,True
1369,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_13,matharena,incorrect,matharena,0.0,,True
1370,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_14,matharena,correct,matharena,0.0,,False
1371,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_10,matharena,correct,matharena,0.0,,False
1372,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_12,matharena,detected,matharena,0.0,,False
1373,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_15,matharena,correct,matharena,0.0,,False
1374,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_18,matharena,incorrect,matharena,0.0,,True
1375,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_16,matharena,incorrect,matharena,0.0,,True
1376,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_2,matharena,correct,matharena,0.0,,False
1377,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_22,matharena,incorrect,matharena,0.0,,True
1378,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_21,matharena,incorrect,matharena,0.0,,True
1379,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_19,matharena,incorrect,matharena,0.0,,True
1380,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_24,matharena,corrected,matharena,0.0,,False
1381,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_26,matharena,correct,matharena,0.0,,False
1382,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_27,matharena,incorrect,matharena,0.0,,True
1383,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_25,matharena,incorrect,matharena,0.0,,True
1384,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_23,matharena,incorrect,matharena,0.0,,True
1385,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_29,matharena,incorrect,matharena,0.0,,True
1386,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_3,matharena,correct,matharena,0.0,,False
1387,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_17,matharena,incorrect,matharena,0.0,,True
1388,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_28,matharena,correct,matharena,0.0,,False
1389,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_32,matharena,correct,matharena,0.0,,False
1390,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_30,matharena,incorrect,matharena,0.0,,True
1391,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_31,matharena,incorrect,matharena,0.0,,True
1392,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_20,matharena,incorrect,matharena,0.0,,True
1393,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_34,matharena,incorrect,matharena,0.0,,True
1394,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_38,matharena,incorrect,matharena,0.0,,True
1395,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_33,matharena,incorrect,matharena,0.0,,True
1396,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_37,matharena,incorrect,matharena,0.0,,True
1397,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_4,matharena,correct,matharena,0.0,,False
1398,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_35,matharena,correct,matharena,0.0,,False
1399,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_40,matharena,incorrect,matharena,0.0,,True
1400,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_41,matharena,incorrect,matharena,0.0,,True
1401,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_44,matharena,correct,matharena,0.0,,False
1402,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_39,matharena,detected,matharena,0.0,,False
1403,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_42,matharena,incorrect,matharena,0.0,,True
1404,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_43,matharena,incorrect,matharena,0.0,,True
1405,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_36,matharena,incorrect,matharena,0.0,,True
1406,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_46,matharena,correct,matharena,0.0,,False
1407,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_45,matharena,incorrect,matharena,0.0,,True
1408,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_47,matharena,correct,matharena,0.0,,False
1409,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_51,matharena,incorrect,matharena,0.0,,True
1410,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_48,matharena,correct,matharena,0.0,,False
1411,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_49,matharena,correct,matharena,0.0,,False
1412,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_7,matharena,correct,matharena,0.0,,False
1413,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_53,matharena,incorrect,matharena,0.0,,True
1414,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_5,matharena,incorrect,matharena,0.0,,True
1415,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_8,matharena,incorrect,matharena,0.0,,True
1416,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_6,matharena,incorrect,matharena,0.0,,True
1417,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_9,matharena,detected,matharena,0.0,,False
1418,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_50,matharena,correct,matharena,0.0,,False
1419,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,nordic_2025_1,nordic,correct,generic,0.0,,False
1420,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,matharena_smt_smt_2025_52,matharena,incorrect,matharena,0.0,,True
1421,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,pan_african_2025_2,pan,incorrect,generic,0.0,,True
1422,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,pan_african_2025_3,pan,corrected,generic,0.0,,False
1423,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,pan_african_2025_5,pan,incorrect,generic,0.0,,True
1424,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,pan_african_2025_6,pan,incorrect,generic,0.0,,True
1425,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,nordic_2025_2,nordic,detected,generic,0.0,,False
1426,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,pan_african_2025_4,pan,correct,generic,0.0,,False
1427,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,philippines_2025_1,philippines,correct,generic,0.0,,False
1428,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,nordic_2025_3,nordic,detected,generic,0.0,,False
1429,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,philippines_2025_2,philippines,incorrect,generic,0.0,,True
1430,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,philippines_2025_3,philippines,detected,generic,0.0,,False
1431,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,philippines_2025_5,philippines,incorrect,generic,0.0,,True
1432,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,philippines_2025_4,philippines,incorrect,generic,0.0,,True
1433,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,pan_african_2025_1,pan,corrected,generic,0.0,,False
1434,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,philippines_2025_7,philippines,incorrect,generic,0.0,,True
1435,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,philippines_2025_8,philippines,correct,generic,0.0,,False
1436,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,polish_2025_2,polish,detected,generic,0.0,,False
1437,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,polish_2025_4,polish,incorrect,generic,0.0,,True
1438,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,polish_2025_1,polish,incorrect,generic,0.0,,True
1439,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,philippines_2025_6,philippines,detected,generic,0.0,,False
1440,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,rmm_2025_1,rmm,incorrect,generic,0.0,,True
1441,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,polish_2025_6,polish,incorrect,generic,0.0,,True
1442,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,rmm_2025_5,rmm,incorrect,generic,0.0,,True
1443,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,rmm_2025_4,rmm,incorrect,generic,0.0,,True
1444,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,polish_2025_5,polish,incorrect,generic,0.0,,True
1445,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,rmm_2025_3,rmm,incorrect,generic,0.0,,True
1446,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,rmm_2025_2,rmm,incorrect,generic,0.0,,True
1447,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,rmm_2025_6,rmm,incorrect,generic,0.0,,True
1448,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,polish_2025_3,polish,incorrect,generic,0.0,,True
1449,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_10_2025_3,romania,detected,generic,0.0,,False
1450,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_10_2025_1,romania,incorrect,generic,0.0,,True
1451,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_10_2025_2,romania,incorrect,generic,0.0,,True
1452,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_11_2025_1,romania,correct,generic,0.0,,False
1453,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_12_2025_1,romania,incorrect,generic,0.0,,True
1454,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_11_2025_3,romania,incorrect,generic,0.0,,True
1455,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_11_2025_2,romania,corrected,generic,0.0,,False
1456,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_9_2025_1,romania,incorrect,generic,0.0,,True
1457,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_tst_2025_1,romania,incorrect,generic,0.0,,True
1458,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_12_2025_3,romania,incorrect,generic,0.0,,True
1459,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_9_2025_2,romania,incorrect,generic,0.0,,True
1460,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_12_2025_2,romania,incorrect,generic,0.0,,True
1461,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,serbia_tst_bmo_2025_2,serbia,incorrect,generic,0.0,,True
1462,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,spain_2025_1,spain,correct,generic,0.0,,False
1463,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,serbia_tst_bmo_2025_1,serbia,incorrect,generic,0.0,,True
1464,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_tst_2025_2,romania,incorrect,generic,0.0,,True
1465,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,romania_tst_2025_3,romania,incorrect,generic,0.0,,True
1466,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,serbia_tst_bmo_2025_4,serbia,incorrect,generic,0.0,,True
1467,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,spain_2025_4,spain,incorrect,generic,0.0,,True
1468,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,spain_2025_5,spain,corrected,generic,0.0,,False
1469,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,spain_2025_2,spain,incorrect,generic,0.0,,True
1470,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_1,thai,incorrect,generic,0.0,,True
1471,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_3,thai,detected,generic,0.0,,False
1472,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_10,thai,corrected,generic,0.0,,False
1473,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_2,thai,incorrect,generic,0.0,,True
1474,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,spain_2025_3,spain,incorrect,generic,0.0,,True
1475,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,serbia_tst_bmo_2025_3,serbia,incorrect,generic,0.0,,True
1476,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_8,thai,incorrect,generic,0.0,,True
1477,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_5,thai,incorrect,generic,0.0,,True
1478,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_6,thai,incorrect,generic,0.0,,True
1479,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_9,thai,incorrect,generic,0.0,,True
1480,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,turkey_tst_2025_2,turkey,incorrect,generic,0.0,,True
1481,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_7,thai,incorrect,generic,0.0,,True
1482,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,thai_2025_4,thai,correct,generic,0.0,,False
1483,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,turkey_tst_2025_1,turkey,incorrect,generic,0.0,,True
1484,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,turkey_tst_2025_6,turkey,incorrect,generic,0.0,,True
1485,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,turkey_tst_2025_9,turkey,incorrect,generic,0.0,,True
1486,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,turkey_tst_2025_4,turkey,incorrect,generic,0.0,,True
1487,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,turkey_tst_2025_3,turkey,incorrect,generic,0.0,,True
1488,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,turkey_tst_2025_7,turkey,incorrect,generic,0.0,,True
1489,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usamo_2025_4,usamo,correct,generic,0.0,,False
1490,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,turkey_tst_2025_5,turkey,incorrect,generic,0.0,,True
1491,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,turkey_tst_2025_8,turkey,incorrect,generic,0.0,,True
1492,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usamo_2025_2,usamo,incorrect,generic,0.0,,True
1493,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usamo_2025_1,usamo,incorrect,generic,0.0,,True
1494,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usamo_2025_3,usamo,incorrect,generic,0.0,,True
1495,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usamo_2025_5,usamo,incorrect,generic,0.0,,True
1496,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usatst_2025_4,usatst,incorrect,generic,0.0,,True
1497,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usatst_2025_1,usatst,incorrect,generic,0.0,,True
1498,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usamo_2025_6,usamo,incorrect,generic,0.0,,True
1499,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usatst_2025_2,usatst,incorrect,generic,0.0,,True
1500,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usatst_2025_7,usatst,correct,generic,0.0,,False
1501,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usatst_2025_5,usatst,incorrect,generic,0.0,,True
1502,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usatst_2025_8,usatst,incorrect,generic,0.0,,True
1503,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usatst_2025_6,usatst,incorrect,generic,0.0,,True
1504,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usatst_2025_3,usatst,incorrect,generic,0.0,,True
1505,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,usatst_2025_9,usatst,incorrect,generic,0.0,,True
1506,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,vietnam_2025_4,vietnam,incorrect,generic,0.0,,True
1507,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,vietnam_2025_3,vietnam,incorrect,generic,0.0,,True
1508,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,vietnam_2025_6,vietnam,detected,generic,0.0,,False
1509,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,vietnam_2025_2,vietnam,correct,generic,0.0,,False
1510,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,vietnam_2025_1,vietnam,correct,generic,0.0,,False
1511,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b,vietnam_2025_5,vietnam,incorrect,generic,0.0,,True
1512,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_11,allrussian,incorrect,generic,0.0,,True
1513,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_13,allrussian,incorrect,generic,0.0,,True
1514,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_15,allrussian,incorrect,generic,0.0,,True
1515,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_1,allrussian,incorrect,generic,0.0,,True
1516,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_14,allrussian,incorrect,generic,0.0,,True
1517,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_10,allrussian,incorrect,generic,0.0,,True
1518,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_2,allrussian,incorrect,generic,0.0,,True
1519,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_12,allrussian,incorrect,generic,0.0,,True
1520,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_3,allrussian,incorrect,generic,0.0,,True
1521,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_6,allrussian,incorrect,generic,0.0,,True
1522,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_7,allrussian,incorrect,generic,0.0,,True
1523,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_8,allrussian,incorrect,generic,0.0,,True
1524,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_5,allrussian,incorrect,generic,0.0,,True
1525,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_16,allrussian,incorrect,generic,0.0,,True
1526,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_9,allrussian,corrected,generic,0.0,,False
1527,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmo_2025_2,bmo,incorrect,generic,0.0,,True
1528,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmo_2025_3,bmo,corrected,generic,0.0,,False
1529,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmo_2025_1,bmo,incorrect,generic,0.0,,True
1530,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_A_2025_3,bmosl,corrected,generic,0.0,,False
1531,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,allrussian_2025_4,allrussian,incorrect,generic,0.0,,True
1532,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_A_2025_6,bmosl,correct,generic,0.0,,False
1533,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmo_2025_4,bmo,incorrect,generic,0.0,,True
1534,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_A_2025_4,bmosl,detected,generic,0.0,,False
1535,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_A_2025_1,bmosl,correct,generic,0.0,,False
1536,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_C_2025_1,bmosl,incorrect,generic,0.0,,True
1537,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_C_2025_2,bmosl,incorrect,generic,0.0,,True
1538,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_A_2025_5,bmosl,incorrect,generic,0.0,,True
1539,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_C_2025_5,bmosl,incorrect,generic,0.0,,True
1540,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_C_2025_4,bmosl,incorrect,generic,0.0,,True
1541,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_C_2025_6,bmosl,incorrect,generic,0.0,,True
1542,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_G_2025_1,bmosl,incorrect,generic,0.0,,True
1543,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_G_2025_2,bmosl,incorrect,generic,0.0,,True
1544,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_G_2025_5,bmosl,incorrect,generic,0.0,,True
1545,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_A_2025_2,bmosl,incorrect,generic,0.0,,True
1546,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_G_2025_4,bmosl,incorrect,generic,0.0,,True
1547,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_G_2025_6,bmosl,incorrect,generic,0.0,,True
1548,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_G_2025_7,bmosl,incorrect,generic,0.0,,True
1549,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_C_2025_3,bmosl,incorrect,generic,0.0,,True
1550,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_G_2025_3,bmosl,detected,generic,0.0,,False
1551,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_NT_2025_1,bmosl,incorrect,generic,0.0,,True
1552,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_NT_2025_2,bmosl,incorrect,generic,0.0,,True
1553,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_NT_2025_4,bmosl,incorrect,generic,0.0,,True
1554,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_NT_2025_5,bmosl,incorrect,generic,0.0,,True
1555,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_NT_2025_3,bmosl,incorrect,generic,0.0,,True
1556,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_NT_2025_7,bmosl,incorrect,generic,0.0,,True
1557,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bmosl_NT_2025_6,bmosl,incorrect,generic,0.0,,True
1558,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bulgaria_2025_3,bulgaria,incorrect,generic,0.0,,True
1559,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bulgaria_2025_1,bulgaria,corrected,generic,0.0,,False
1560,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bulgaria_2025_2,bulgaria,correct,generic,0.0,,False
1561,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bulgaria_2025_5,bulgaria,detected,generic,0.0,,False
1562,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bulgaria_2025_4,bulgaria,incorrect,generic,0.0,,True
1563,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,canada_2025_1,canada,detected,generic,0.0,,False
1564,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,canada_2025_2,canada,incorrect,generic,0.0,,True
1565,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,canada_2025_4,canada,incorrect,generic,0.0,,True
1566,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,china_2025_2,china,incorrect,generic,0.0,,True
1567,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,canada_2025_5,canada,incorrect,generic,0.0,,True
1568,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,bulgaria_2025_6,bulgaria,incorrect,generic,0.0,,True
1569,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,canada_2025_3,canada,detected,generic,0.0,,False
1570,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,china_2025_5,china,incorrect,generic,0.0,,True
1571,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,china_2025_1,china,incorrect,generic,0.0,,True
1572,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,china_2025_6,china,incorrect,generic,0.0,,True
1573,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_1,chinatst,incorrect,generic,0.0,,True
1574,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_12,chinatst,incorrect,generic,0.0,,True
1575,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,china_2025_3,china,incorrect,generic,0.0,,True
1576,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_10,chinatst,incorrect,generic,0.0,,True
1577,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_11,chinatst,incorrect,generic,0.0,,True
1578,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_15,chinatst,incorrect,generic,0.0,,True
1579,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_14,chinatst,incorrect,generic,0.0,,True
1580,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_13,chinatst,incorrect,generic,0.0,,True
1581,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_2,chinatst,incorrect,generic,0.0,,True
1582,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_17,chinatst,incorrect,generic,0.0,,True
1583,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_19,chinatst,incorrect,generic,0.0,,True
1584,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_18,chinatst,incorrect,generic,0.0,,True
1585,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_16,chinatst,incorrect,generic,0.0,,True
1586,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_21,chinatst,incorrect,generic,0.0,,True
1587,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_4,chinatst,incorrect,generic,0.0,,True
1588,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_20,chinatst,incorrect,generic,0.0,,True
1589,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_3,chinatst,incorrect,generic,0.0,,True
1590,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_5,chinatst,incorrect,generic,0.0,,True
1591,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_22,chinatst,incorrect,generic,0.0,,True
1592,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_7,chinatst,incorrect,generic,0.0,,True
1593,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_6,chinatst,incorrect,generic,0.0,,True
1594,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_8,chinatst,incorrect,generic,0.0,,True
1595,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,egmo_2025_1,egmo,incorrect,generic,0.0,,True
1596,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,egmo_2025_2,egmo,corrected,generic,0.0,,False
1597,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,chinatst_2025_9,chinatst,detected,generic,0.0,,False
1598,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,egmo_2025_4,egmo,incorrect,generic,0.0,,True
1599,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,egmo_2025_3,egmo,incorrect,generic,0.0,,True
1600,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,egmo_2025_6,egmo,incorrect,generic,0.0,,True
1601,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_A_2025_3,elmosl,incorrect,generic,0.0,,True
1602,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,egmo_2025_5,egmo,incorrect,generic,0.0,,True
1603,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_A_2025_1,elmosl,incorrect,generic,0.0,,True
1604,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_A_2025_2,elmosl,incorrect,generic,0.0,,True
1605,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_A_2025_6,elmosl,incorrect,generic,0.0,,True
1606,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_A_2025_7,elmosl,incorrect,generic,0.0,,True
1607,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_C_2025_3,elmosl,correct,generic,0.0,,False
1608,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_C_2025_7,elmosl,incorrect,generic,0.0,,True
1609,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_C_2025_2,elmosl,incorrect,generic,0.0,,True
1610,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_A_2025_5,elmosl,incorrect,generic,0.0,,True
1611,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_C_2025_6,elmosl,incorrect,generic,0.0,,True
1612,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_C_2025_5,elmosl,incorrect,generic,0.0,,True
1613,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_A_2025_8,elmosl,incorrect,generic,0.0,,True
1614,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_C_2025_1,elmosl,incorrect,generic,0.0,,True
1615,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_G_2025_1,elmosl,incorrect,generic,0.0,,True
1616,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_C_2025_8,elmosl,incorrect,generic,0.0,,True
1617,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_C_2025_9,elmosl,incorrect,generic,0.0,,True
1618,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_C_2025_4,elmosl,incorrect,generic,0.0,,True
1619,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_G_2025_3,elmosl,detected,generic,0.0,,False
1620,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_G_2025_6,elmosl,incorrect,generic,0.0,,True
1621,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_G_2025_5,elmosl,incorrect,generic,0.0,,True
1622,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_G_2025_2,elmosl,incorrect,generic,0.0,,True
1623,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_G_2025_4,elmosl,incorrect,generic,0.0,,True
1624,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_G_2025_9,elmosl,incorrect,generic,0.0,,True
1625,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_G_2025_8,elmosl,incorrect,generic,0.0,,True
1626,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_NT_2025_1,elmosl,incorrect,generic,0.0,,True
1627,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_NT_2025_3,elmosl,incorrect,generic,0.0,,True
1628,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_NT_2025_2,elmosl,incorrect,generic,0.0,,True
1629,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_NT_2025_6,elmosl,detected,generic,0.0,,False
1630,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_NT_2025_4,elmosl,incorrect,generic,0.0,,True
1631,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,german_2025_1,german,correct,generic,0.0,,False
1632,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_NT_2025_5,elmosl,incorrect,generic,0.0,,True
1633,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_NT_2025_7,elmosl,incorrect,generic,0.0,,True
1634,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,german_2025_4,german,correct,generic,0.0,,False
1635,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,greece_2025_2,greece,incorrect,generic,0.0,,True
1636,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,elmosl_G_2025_7,elmosl,incorrect,generic,0.0,,True
1637,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,greece_2025_4,greece,detected,generic,0.0,,False
1638,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,german_2025_2,german,incorrect,generic,0.0,,True
1639,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,german_2025_3,german,incorrect,generic,0.0,,True
1640,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,greece_2025_3,greece,incorrect,generic,0.0,,True
1641,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_10,imosl,incorrect,generic,0.0,,True
1642,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_1,imosl,correct,generic,0.0,,False
1643,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,greece_2025_1,greece,correct,generic,0.0,,False
1644,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_15,imosl,incorrect,generic,0.0,,True
1645,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_17,imosl,incorrect,generic,0.0,,True
1646,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_11,imosl,incorrect,generic,0.0,,True
1647,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_12,imosl,incorrect,generic,0.0,,True
1648,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_19,imosl,incorrect,generic,0.0,,True
1649,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_2,imosl,correct,generic,0.0,,False
1650,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_18,imosl,detected,generic,0.0,,False
1651,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_20,imosl,incorrect,generic,0.0,,True
1652,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_16,imosl,incorrect,generic,0.0,,True
1653,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_22,imosl,incorrect,generic,0.0,,True
1654,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_23,imosl,incorrect,generic,0.0,,True
1655,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_14,imosl,incorrect,generic,0.0,,True
1656,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_25,imosl,incorrect,generic,0.0,,True
1657,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_24,imosl,incorrect,generic,0.0,,True
1658,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_21,imosl,incorrect,generic,0.0,,True
1659,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_27,imosl,incorrect,generic,0.0,,True
1660,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_29,imosl,incorrect,generic,0.0,,True
1661,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_28,imosl,incorrect,generic,0.0,,True
1662,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_34,imosl,incorrect,generic,0.0,,True
1663,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_3,imosl,incorrect,generic,0.0,,True
1664,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_32,imosl,incorrect,generic,0.0,,True
1665,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_30,imosl,incorrect,generic,0.0,,True
1666,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_31,imosl,incorrect,generic,0.0,,True
1667,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_13,imosl,incorrect,generic,0.0,,True
1668,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_26,imosl,incorrect,generic,0.0,,True
1669,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_35,imosl,incorrect,generic,0.0,,True
1670,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_33,imosl,incorrect,generic,0.0,,True
1671,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_36,imosl,incorrect,generic,0.0,,True
1672,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_4,imosl,incorrect,generic,0.0,,True
1673,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_37,imosl,incorrect,generic,0.0,,True
1674,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_5,imosl,incorrect,generic,0.0,,True
1675,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_7,imosl,incorrect,generic,0.0,,True
1676,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_8,imosl,incorrect,generic,0.0,,True
1677,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_9,imosl,incorrect,generic,0.0,,True
1678,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,imosl_2025_6,imosl,incorrect,generic,0.0,,True
1679,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_2025_3,india,incorrect,generic,0.0,,True
1680,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_2025_5,india,incorrect,generic,0.0,,True
1681,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_2025_4,india,incorrect,generic,0.0,,True
1682,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_12,india,incorrect,generic,0.0,,True
1683,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_2025_6,india,incorrect,generic,0.0,,True
1684,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_2025_1,india,correct,generic,0.0,,False
1685,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_11,india,incorrect,generic,0.0,,True
1686,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_10,india,incorrect,generic,0.0,,True
1687,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_13,india,incorrect,generic,0.0,,True
1688,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_2025_2,india,incorrect,generic,0.0,,True
1689,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_14,india,incorrect,generic,0.0,,True
1690,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_1,india,incorrect,generic,0.0,,True
1691,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_16,india,correct,generic,0.0,,False
1692,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_18,india,incorrect,generic,0.0,,True
1693,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_2,india,incorrect,generic,0.0,,True
1694,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_17,india,incorrect,generic,0.0,,True
1695,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_15,india,incorrect,generic,0.0,,True
1696,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_20,india,incorrect,generic,0.0,,True
1697,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_21,india,incorrect,generic,0.0,,True
1698,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_4,india,incorrect,generic,0.0,,True
1699,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_3,india,incorrect,generic,0.0,,True
1700,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_6,india,incorrect,generic,0.0,,True
1701,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_19,india,incorrect,generic,0.0,,True
1702,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_5,india,incorrect,generic,0.0,,True
1703,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_7,india,incorrect,generic,0.0,,True
1704,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,iran_tst_2025_2,iran,incorrect,generic,0.0,,True
1705,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,iran_tst_2025_1,iran,incorrect,generic,0.0,,True
1706,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_9,india,correct,generic,0.0,,False
1707,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,india_prep_2025_8,india,incorrect,generic,0.0,,True
1708,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,iran_tst_2025_6,iran,incorrect,generic,0.0,,True
1709,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,iran_tst_2025_4,iran,incorrect,generic,0.0,,True
1710,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,iran_tst_2025_9,iran,incorrect,generic,0.0,,True
1711,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,iran_tst_2025_8,iran,incorrect,generic,0.0,,True
1712,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,iran_tst_2025_5,iran,incorrect,generic,0.0,,True
1713,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_1,israel,incorrect,generic,0.0,,True
1714,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,iran_tst_2025_7,iran,incorrect,generic,0.0,,True
1715,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,iran_tst_2025_3,iran,incorrect,generic,0.0,,True
1716,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_2,israel,incorrect,generic,0.0,,True
1717,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_6,israel,incorrect,generic,0.0,,True
1718,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_5,israel,incorrect,generic,0.0,,True
1719,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_7,israel,incorrect,generic,0.0,,True
1720,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_8,israel,incorrect,generic,0.0,,True
1721,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_9,israel,incorrect,generic,0.0,,True
1722,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_4,israel,incorrect,generic,0.0,,True
1723,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_3,israel,detected,generic,0.0,,False
1724,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,israel_tst_2025_10,israel,incorrect,generic,0.0,,True
1725,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,izho_2025_3,izho,incorrect,generic,0.0,,True
1726,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,jbmo_2025_1,jbmo,detected,generic,0.0,,False
1727,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,izho_2025_6,izho,incorrect,generic,0.0,,True
1728,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,izho_2025_2,izho,incorrect,generic,0.0,,True
1729,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,izho_2025_4,izho,incorrect,generic,0.0,,True
1730,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,izho_2025_5,izho,incorrect,generic,0.0,,True
1731,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,jbmo_2025_4,jbmo,corrected,generic,0.0,,False
1732,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,korea_2025_1,korea,incorrect,generic,0.0,,True
1733,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,jbmo_2025_2,jbmo,correct,generic,0.0,,False
1734,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,jbmo_2025_3,jbmo,incorrect,generic,0.0,,True
1735,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,korea_2025_2,korea,detected,generic,0.0,,False
1736,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,korea_2025_4,korea,incorrect,generic,0.0,,True
1737,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,korea_2025_3,korea,incorrect,generic,0.0,,True
1738,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,izho_2025_1,izho,incorrect,generic,0.0,,True
1739,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_11,matharena,incorrect,matharena,0.0,,True
1740,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,korea_2025_5,korea,incorrect,generic,0.0,,True
1741,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_1,matharena,detected,matharena,0.0,,False
1742,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_10,matharena,detected,matharena,0.0,,False
1743,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_12,matharena,detected,matharena,0.0,,False
1744,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_13,matharena,incorrect,matharena,0.0,,True
1745,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_14,matharena,incorrect,matharena,0.0,,True
1746,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_16,matharena,detected,matharena,0.0,,False
1747,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_15,matharena,incorrect,matharena,0.0,,True
1748,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_18,matharena,correct,matharena,0.0,,False
1749,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_17,matharena,correct,matharena,0.0,,False
1750,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_19,matharena,detected,matharena,0.0,,False
1751,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_2,matharena,correct,matharena,0.0,,False
1752,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,korea_2025_6,korea,incorrect,generic,0.0,,True
1753,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_21,matharena,incorrect,matharena,0.0,,True
1754,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_20,matharena,detected,matharena,0.0,,False
1755,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_25,matharena,detected,matharena,0.0,,False
1756,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_24,matharena,incorrect,matharena,0.0,,True
1757,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_23,matharena,incorrect,matharena,0.0,,True
1758,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_22,matharena,correct,matharena,0.0,,False
1759,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_26,matharena,detected,matharena,0.0,,False
1760,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_28,matharena,incorrect,matharena,0.0,,True
1761,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_30,matharena,incorrect,matharena,0.0,,True
1762,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_29,matharena,incorrect,matharena,0.0,,True
1763,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_27,matharena,detected,matharena,0.0,,False
1764,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_3,matharena,correct,matharena,0.0,,False
1765,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_4,matharena,correct,matharena,0.0,,False
1766,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_7,matharena,corrected,matharena,0.0,,False
1767,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_6,matharena,correct,matharena,0.0,,False
1768,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_5,matharena,correct,matharena,0.0,,False
1769,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_1,matharena,detected,matharena,0.0,,False
1770,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_10,matharena,incorrect,matharena,0.0,,True
1771,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_9,matharena,detected,matharena,0.0,,False
1772,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_11,matharena,correct,matharena,0.0,,False
1773,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_12,matharena,incorrect,matharena,0.0,,True
1774,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_16,matharena,correct,matharena,0.0,,False
1775,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_13,matharena,incorrect,matharena,0.0,,True
1776,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_aime_aime_2025_8,matharena,correct,matharena,0.0,,False
1777,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_14,matharena,incorrect,matharena,0.0,,True
1778,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_15,matharena,correct,matharena,0.0,,False
1779,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_18,matharena,correct,matharena,0.0,,False
1780,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_2,matharena,correct,matharena,0.0,,False
1781,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_19,matharena,correct,matharena,0.0,,False
1782,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_17,matharena,detected,matharena,0.0,,False
1783,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_21,matharena,detected,matharena,0.0,,False
1784,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_22,matharena,incorrect,matharena,0.0,,True
1785,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_26,matharena,correct,matharena,0.0,,False
1786,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_24,matharena,incorrect,matharena,0.0,,True
1787,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_20,matharena,incorrect,matharena,0.0,,True
1788,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_27,matharena,incorrect,matharena,0.0,,True
1789,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_25,matharena,detected,matharena,0.0,,False
1790,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_28,matharena,incorrect,matharena,0.0,,True
1791,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_3,matharena,detected,matharena,0.0,,False
1792,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_30,matharena,incorrect,matharena,0.0,,True
1793,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_23,matharena,correct,matharena,0.0,,False
1794,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_29,matharena,corrected,matharena,0.0,,False
1795,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_6,matharena,detected,matharena,0.0,,False
1796,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_7,matharena,correct,matharena,0.0,,False
1797,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_5,matharena,incorrect,matharena,0.0,,True
1798,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_8,matharena,correct,matharena,0.0,,False
1799,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_1,matharena,correct,matharena,0.0,,False
1800,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_9,matharena,detected,matharena,0.0,,False
1801,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_brumo_brumo_2025_4,matharena,correct,matharena,0.0,,False
1802,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_13,matharena,correct,matharena,0.0,,False
1803,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_10,matharena,correct,matharena,0.0,,False
1804,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_12,matharena,correct,matharena,0.0,,False
1805,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_15,matharena,detected,matharena,0.0,,False
1806,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_17,matharena,correct,matharena,0.0,,False
1807,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_18,matharena,incorrect,matharena,0.0,,True
1808,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_11,matharena,detected,matharena,0.0,,False
1809,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_14,matharena,correct,matharena,0.0,,False
1810,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_2,matharena,correct,matharena,0.0,,False
1811,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_19,matharena,incorrect,matharena,0.0,,True
1812,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_16,matharena,incorrect,matharena,0.0,,True
1813,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_21,matharena,incorrect,matharena,0.0,,True
1814,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_20,matharena,incorrect,matharena,0.0,,True
1815,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_24,matharena,correct,matharena,0.0,,False
1816,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_22,matharena,detected,matharena,0.0,,False
1817,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_25,matharena,incorrect,matharena,0.0,,True
1818,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_23,matharena,correct,matharena,0.0,,False
1819,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_27,matharena,incorrect,matharena,0.0,,True
1820,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_26,matharena,correct,matharena,0.0,,False
1821,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_28,matharena,incorrect,matharena,0.0,,True
1822,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_31,matharena,detected,matharena,0.0,,False
1823,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_30,matharena,incorrect,matharena,0.0,,True
1824,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_3,matharena,detected,matharena,0.0,,False
1825,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_34,matharena,incorrect,matharena,0.0,,True
1826,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_36,matharena,incorrect,matharena,0.0,,True
1827,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_38,matharena,incorrect,matharena,0.0,,True
1828,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_33,matharena,incorrect,matharena,0.0,,True
1829,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_29,matharena,incorrect,matharena,0.0,,True
1830,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_37,matharena,detected,matharena,0.0,,False
1831,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_4,matharena,correct,matharena,0.0,,False
1832,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_5,matharena,incorrect,matharena,0.0,,True
1833,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_39,matharena,detected,matharena,0.0,,False
1834,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_40,matharena,incorrect,matharena,0.0,,True
1835,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_35,matharena,correct,matharena,0.0,,False
1836,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_8,matharena,correct,matharena,0.0,,False
1837,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_32,matharena,incorrect,matharena,0.0,,True
1838,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_9,matharena,incorrect,matharena,0.0,,True
1839,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_6,matharena,incorrect,matharena,0.0,,True
1840,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_1,matharena,detected,matharena,0.0,,False
1841,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_cmimc_cmimc_2025_7,matharena,incorrect,matharena,0.0,,True
1842,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_13,matharena,incorrect,matharena,0.0,,True
1843,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_10,matharena,incorrect,matharena,0.0,,True
1844,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_11,matharena,incorrect,matharena,0.0,,True
1845,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_16,matharena,corrected,matharena,0.0,,False
1846,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_15,matharena,incorrect,matharena,0.0,,True
1847,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_14,matharena,incorrect,matharena,0.0,,True
1848,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_19,matharena,incorrect,matharena,0.0,,True
1849,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_18,matharena,incorrect,matharena,0.0,,True
1850,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_2,matharena,correct,matharena,0.0,,False
1851,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_17,matharena,incorrect,matharena,0.0,,True
1852,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_20,matharena,incorrect,matharena,0.0,,True
1853,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_21,matharena,correct,matharena,0.0,,False
1854,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_23,matharena,correct,matharena,0.0,,False
1855,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_12,matharena,incorrect,matharena,0.0,,True
1856,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_26,matharena,correct,matharena,0.0,,False
1857,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_22,matharena,incorrect,matharena,0.0,,True
1858,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_28,matharena,detected,matharena,0.0,,False
1859,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_27,matharena,incorrect,matharena,0.0,,True
1860,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_3,matharena,detected,matharena,0.0,,False
1861,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_25,matharena,incorrect,matharena,0.0,,True
1862,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_24,matharena,incorrect,matharena,0.0,,True
1863,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_30,matharena,incorrect,matharena,0.0,,True
1864,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_4,matharena,incorrect,matharena,0.0,,True
1865,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_6,matharena,detected,matharena,0.0,,False
1866,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_29,matharena,incorrect,matharena,0.0,,True
1867,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_8,matharena,corrected,matharena,0.0,,False
1868,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_5,matharena,detected,matharena,0.0,,False
1869,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_1,matharena,correct,matharena,0.0,,False
1870,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_11,matharena,incorrect,matharena,0.0,,True
1871,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_12,matharena,correct,matharena,0.0,,False
1872,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_7,matharena,detected,matharena,0.0,,False
1873,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_13,matharena,incorrect,matharena,0.0,,True
1874,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_14,matharena,correct,matharena,0.0,,False
1875,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_17,matharena,corrected,matharena,0.0,,False
1876,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_15,matharena,detected,matharena,0.0,,False
1877,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_16,matharena,correct,matharena,0.0,,False
1878,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_hmmt_hmmt_feb_2025_9,matharena,incorrect,matharena,0.0,,True
1879,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_2,matharena,correct,matharena,0.0,,False
1880,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_18,matharena,correct,matharena,0.0,,False
1881,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_19,matharena,incorrect,matharena,0.0,,True
1882,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_10,matharena,detected,matharena,0.0,,False
1883,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_21,matharena,correct,matharena,0.0,,False
1884,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_20,matharena,incorrect,matharena,0.0,,True
1885,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_23,matharena,correct,matharena,0.0,,False
1886,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_22,matharena,incorrect,matharena,0.0,,True
1887,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_27,matharena,incorrect,matharena,0.0,,True
1888,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_25,matharena,correct,matharena,0.0,,False
1889,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_26,matharena,detected,matharena,0.0,,False
1890,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_24,matharena,incorrect,matharena,0.0,,True
1891,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_29,matharena,correct,matharena,0.0,,False
1892,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_30,matharena,detected,matharena,0.0,,False
1893,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_3,matharena,correct,matharena,0.0,,False
1894,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_32,matharena,correct,matharena,0.0,,False
1895,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_31,matharena,correct,matharena,0.0,,False
1896,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_33,matharena,incorrect,matharena,0.0,,True
1897,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_34,matharena,incorrect,matharena,0.0,,True
1898,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_28,matharena,incorrect,matharena,0.0,,True
1899,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_38,matharena,incorrect,matharena,0.0,,True
1900,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_36,matharena,detected,matharena,0.0,,False
1901,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_41,matharena,incorrect,matharena,0.0,,True
1902,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_44,matharena,correct,matharena,0.0,,False
1903,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_35,matharena,correct,matharena,0.0,,False
1904,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_37,matharena,incorrect,matharena,0.0,,True
1905,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_40,matharena,incorrect,matharena,0.0,,True
1906,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_43,matharena,incorrect,matharena,0.0,,True
1907,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_42,matharena,incorrect,matharena,0.0,,True
1908,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_46,matharena,incorrect,matharena,0.0,,True
1909,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_45,matharena,correct,matharena,0.0,,False
1910,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_47,matharena,detected,matharena,0.0,,False
1911,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_48,matharena,correct,matharena,0.0,,False
1912,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_49,matharena,detected,matharena,0.0,,False
1913,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_50,matharena,correct,matharena,0.0,,False
1914,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_4,matharena,correct,matharena,0.0,,False
1915,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_5,matharena,correct,matharena,0.0,,False
1916,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_39,matharena,detected,matharena,0.0,,False
1917,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_51,matharena,detected,matharena,0.0,,False
1918,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_53,matharena,incorrect,matharena,0.0,,True
1919,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,nordic_2025_1,nordic,correct,generic,0.0,,False
1920,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_7,matharena,correct,matharena,0.0,,False
1921,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_6,matharena,detected,matharena,0.0,,False
1922,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_8,matharena,detected,matharena,0.0,,False
1923,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_9,matharena,correct,matharena,0.0,,False
1924,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,matharena_smt_smt_2025_52,matharena,correct,matharena,0.0,,False
1925,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,pan_african_2025_2,pan,correct,generic,0.0,,False
1926,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,nordic_2025_2,nordic,incorrect,generic,0.0,,True
1927,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,philippines_2025_1,philippines,incorrect,generic,0.0,,True
1928,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,pan_african_2025_1,pan,incorrect,generic,0.0,,True
1929,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,nordic_2025_3,nordic,incorrect,generic,0.0,,True
1930,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,pan_african_2025_5,pan,correct,generic,0.0,,False
1931,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,philippines_2025_3,philippines,detected,generic,0.0,,False
1932,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,pan_african_2025_3,pan,corrected,generic,0.0,,False
1933,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,philippines_2025_2,philippines,incorrect,generic,0.0,,True
1934,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,pan_african_2025_6,pan,incorrect,generic,0.0,,True
1935,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,philippines_2025_4,philippines,incorrect,generic,0.0,,True
1936,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,philippines_2025_6,philippines,incorrect,generic,0.0,,True
1937,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,pan_african_2025_4,pan,detected,generic,0.0,,False
1938,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,philippines_2025_8,philippines,incorrect,generic,0.0,,True
1939,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,philippines_2025_5,philippines,incorrect,generic,0.0,,True
1940,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,polish_2025_1,polish,incorrect,generic,0.0,,True
1941,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,philippines_2025_7,philippines,incorrect,generic,0.0,,True
1942,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,polish_2025_5,polish,incorrect,generic,0.0,,True
1943,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,polish_2025_4,polish,incorrect,generic,0.0,,True
1944,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,polish_2025_6,polish,incorrect,generic,0.0,,True
1945,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,polish_2025_2,polish,incorrect,generic,0.0,,True
1946,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,rmm_2025_4,rmm,incorrect,generic,0.0,,True
1947,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,rmm_2025_2,rmm,incorrect,generic,0.0,,True
1948,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,rmm_2025_5,rmm,incorrect,generic,0.0,,True
1949,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,polish_2025_3,polish,incorrect,generic,0.0,,True
1950,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,rmm_2025_1,rmm,incorrect,generic,0.0,,True
1951,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,rmm_2025_6,rmm,detected,generic,0.0,,False
1952,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_10_2025_1,romania,incorrect,generic,0.0,,True
1953,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,rmm_2025_3,rmm,incorrect,generic,0.0,,True
1954,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_11_2025_2,romania,corrected,generic,0.0,,False
1955,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_10_2025_3,romania,incorrect,generic,0.0,,True
1956,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_10_2025_2,romania,incorrect,generic,0.0,,True
1957,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_11_2025_1,romania,detected,generic,0.0,,False
1958,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_11_2025_3,romania,incorrect,generic,0.0,,True
1959,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_9_2025_1,romania,incorrect,generic,0.0,,True
1960,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_tst_2025_1,romania,incorrect,generic,0.0,,True
1961,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_tst_2025_2,romania,incorrect,generic,0.0,,True
1962,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_12_2025_2,romania,correct,generic,0.0,,False
1963,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_12_2025_1,romania,incorrect,generic,0.0,,True
1964,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_tst_2025_3,romania,incorrect,generic,0.0,,True
1965,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_12_2025_3,romania,detected,generic,0.0,,False
1966,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,romania_9_2025_2,romania,incorrect,generic,0.0,,True
1967,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,serbia_tst_bmo_2025_2,serbia,incorrect,generic,0.0,,True
1968,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,serbia_tst_bmo_2025_1,serbia,incorrect,generic,0.0,,True
1969,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,serbia_tst_bmo_2025_3,serbia,detected,generic,0.0,,False
1970,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,spain_2025_5,spain,correct,generic,0.0,,False
1971,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,spain_2025_4,spain,incorrect,generic,0.0,,True
1972,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,serbia_tst_bmo_2025_4,serbia,corrected,generic,0.0,,False
1973,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,spain_2025_1,spain,incorrect,generic,0.0,,True
1974,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_1,thai,incorrect,generic,0.0,,True
1975,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,spain_2025_3,spain,detected,generic,0.0,,False
1976,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_3,thai,detected,generic,0.0,,False
1977,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_4,thai,incorrect,generic,0.0,,True
1978,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,spain_2025_2,spain,detected,generic,0.0,,False
1979,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_10,thai,incorrect,generic,0.0,,True
1980,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_7,thai,incorrect,generic,0.0,,True
1981,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,turkey_tst_2025_1,turkey,detected,generic,0.0,,False
1982,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_8,thai,incorrect,generic,0.0,,True
1983,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_5,thai,incorrect,generic,0.0,,True
1984,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_6,thai,incorrect,generic,0.0,,True
1985,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_2,thai,incorrect,generic,0.0,,True
1986,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,turkey_tst_2025_2,turkey,incorrect,generic,0.0,,True
1987,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,turkey_tst_2025_7,turkey,incorrect,generic,0.0,,True
1988,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,thai_2025_9,thai,incorrect,generic,0.0,,True
1989,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,turkey_tst_2025_4,turkey,incorrect,generic,0.0,,True
1990,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,turkey_tst_2025_3,turkey,incorrect,generic,0.0,,True
1991,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,turkey_tst_2025_5,turkey,incorrect,generic,0.0,,True
1992,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,turkey_tst_2025_6,turkey,incorrect,generic,0.0,,True
1993,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,turkey_tst_2025_9,turkey,incorrect,generic,0.0,,True
1994,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,turkey_tst_2025_8,turkey,incorrect,generic,0.0,,True
1995,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usamo_2025_1,usamo,incorrect,generic,0.0,,True
1996,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usamo_2025_3,usamo,incorrect,generic,0.0,,True
1997,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usamo_2025_2,usamo,incorrect,generic,0.0,,True
1998,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usamo_2025_6,usamo,incorrect,generic,0.0,,True
1999,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usamo_2025_4,usamo,incorrect,generic,0.0,,True
2000,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usatst_2025_5,usatst,incorrect,generic,0.0,,True
2001,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usatst_2025_1,usatst,incorrect,generic,0.0,,True
2002,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usatst_2025_3,usatst,incorrect,generic,0.0,,True
2003,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usatst_2025_4,usatst,incorrect,generic,0.0,,True
2004,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usatst_2025_7,usatst,correct,generic,0.0,,False
2005,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usatst_2025_2,usatst,incorrect,generic,0.0,,True
2006,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usatst_2025_6,usatst,incorrect,generic,0.0,,True
2007,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,vietnam_2025_3,vietnam,incorrect,generic,0.0,,True
2008,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,vietnam_2025_4,vietnam,incorrect,generic,0.0,,True
2009,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,vietnam_2025_2,vietnam,detected,generic,0.0,,False
2010,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usatst_2025_8,usatst,incorrect,generic,0.0,,True
2011,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,vietnam_2025_5,vietnam,incorrect,generic,0.0,,True
2012,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,vietnam_2025_6,vietnam,correct,generic,0.0,,False
2013,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usamo_2025_5,usamo,incorrect,generic,0.0,,True
2014,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,usatst_2025_9,usatst,incorrect,generic,0.0,,True
2015,GPT-5-mini (medium),Qwen3-235B-A22B,qwen/qwen3_235b_a22b,vietnam_2025_1,vietnam,incorrect,generic,0.0,,True
2016,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_12,allrussian,incorrect,generic,0.06454859,,True
2017,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_14,allrussian,correct,generic,0.04870824,,False
2018,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_13,allrussian,detected,generic,0.0225003999999999,,False
2019,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_16,allrussian,incorrect,generic,0.05444268,,True
2020,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_1,allrussian,incorrect,generic,0.0986418,,True
2021,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_11,allrussian,incorrect,generic,0.1011237099999999,,True
2022,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_2,allrussian,incorrect,generic,0.07476042,,True
2023,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_5,allrussian,correct,generic,0.0400699199999999,,False
2024,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_3,allrussian,correct,generic,0.05787928,,False
2025,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_15,allrussian,incorrect,generic,0.03389854,,True
2026,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_8,allrussian,incorrect,generic,0.0856799199999999,,True
2027,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_6,allrussian,incorrect,generic,0.08927023,,True
2028,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_7,allrussian,incorrect,generic,0.05674328,,True
2029,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_9,allrussian,corrected,generic,0.03716521,,False
2030,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmo_2025_1,bmo,incorrect,generic,0.0705322,,True
2031,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmo_2025_2,bmo,incorrect,generic,0.1169886899999999,,True
2032,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmo_2025_3,bmo,incorrect,generic,0.09040789,,True
2033,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_4,allrussian,incorrect,generic,0.0835050899999999,,True
2034,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmo_2025_4,bmo,incorrect,generic,0.10001768,,True
2035,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_A_2025_1,bmosl,correct,generic,0.04426744,,False
2036,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_A_2025_3,bmosl,incorrect,generic,0.0787242999999999,,True
2037,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_A_2025_4,bmosl,incorrect,generic,0.0540594099999999,,True
2038,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_A_2025_6,bmosl,incorrect,generic,0.08811453,,True
2039,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_A_2025_5,bmosl,correct,generic,0.0731234499999999,,False
2040,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_C_2025_2,bmosl,incorrect,generic,0.0882542299999999,,True
2041,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_C_2025_4,bmosl,incorrect,generic,0.03912814,,True
2042,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_A_2025_2,bmosl,incorrect,generic,0.10754955,,True
2043,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_C_2025_1,bmosl,incorrect,generic,0.05601238,,True
2044,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,allrussian_2025_10,allrussian,incorrect,generic,0.08161671,,True
2045,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_G_2025_1,bmosl,incorrect,generic,0.07767764,,True
2046,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_C_2025_3,bmosl,incorrect,generic,0.12109662,,True
2047,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_C_2025_6,bmosl,incorrect,generic,0.06338556,,True
2048,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_C_2025_5,bmosl,incorrect,generic,0.07187642,,True
2049,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_G_2025_6,bmosl,correct,generic,0.08158078,,False
2050,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_G_2025_2,bmosl,incorrect,generic,0.04914295,,True
2051,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_G_2025_7,bmosl,correct,generic,0.0855921399999999,,False
2052,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_G_2025_4,bmosl,incorrect,generic,0.0444498,,True
2053,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_NT_2025_1,bmosl,corrected,generic,0.09194322,,False
2054,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_G_2025_3,bmosl,correct,generic,0.04611961,,False
2055,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_NT_2025_3,bmosl,incorrect,generic,0.0985815,,True
2056,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_NT_2025_2,bmosl,correct,generic,0.07754977,,False
2057,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_NT_2025_7,bmosl,correct,generic,0.0287117099999999,,False
2058,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_G_2025_5,bmosl,corrected,generic,0.03712261,,False
2059,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_NT_2025_6,bmosl,incorrect,generic,0.0488078,,True
2060,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_NT_2025_5,bmosl,incorrect,generic,0.06082641,,True
2061,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bulgaria_2025_2,bulgaria,incorrect,generic,0.0710294799999999,,True
2062,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bulgaria_2025_1,bulgaria,corrected,generic,0.03518814,,False
2063,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bmosl_NT_2025_4,bmosl,incorrect,generic,0.0952413,,True
2064,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bulgaria_2025_4,bulgaria,incorrect,generic,0.08014732,,True
2065,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,canada_2025_1,canada,incorrect,generic,0.03176714,,True
2066,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,canada_2025_2,canada,incorrect,generic,0.0382432,,True
2067,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bulgaria_2025_3,bulgaria,incorrect,generic,0.05602052,,True
2068,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bulgaria_2025_6,bulgaria,incorrect,generic,0.0730807899999999,,True
2069,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,canada_2025_5,canada,detected,generic,0.06573175,,False
2070,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,china_2025_2,china,incorrect,generic,0.0543366199999999,,True
2071,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,canada_2025_3,canada,incorrect,generic,0.08107106,,True
2072,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,canada_2025_4,canada,incorrect,generic,0.05892678,,True
2073,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,china_2025_1,china,incorrect,generic,0.05227478,,True
2074,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,china_2025_5,china,incorrect,generic,0.05183496,,True
2075,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_10,chinatst,incorrect,generic,0.06352384,,True
2076,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_12,chinatst,incorrect,generic,0.09008443,,True
2077,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,bulgaria_2025_5,bulgaria,incorrect,generic,0.01949951,,True
2078,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,china_2025_3,china,incorrect,generic,0.0704009199999999,,True
2079,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_11,chinatst,correct,generic,0.05933392,,False
2080,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_13,chinatst,incorrect,generic,0.09371925,,True
2081,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_15,chinatst,incorrect,generic,0.06021108,,True
2082,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,china_2025_6,china,incorrect,generic,0.0726939299999999,,True
2083,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_16,chinatst,incorrect,generic,0.0980905399999999,,True
2084,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_14,chinatst,incorrect,generic,0.06209231,,True
2085,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_2,chinatst,correct,generic,0.06580387,,False
2086,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_18,chinatst,incorrect,generic,0.0698129,,True
2087,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_19,chinatst,incorrect,generic,0.05375629,,True
2088,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_21,chinatst,incorrect,generic,0.0694267699999999,,True
2089,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_17,chinatst,incorrect,generic,0.07812058,,True
2090,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_7,chinatst,incorrect,generic,0.06109632,,True
2091,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_22,chinatst,incorrect,generic,0.09194592,,True
2092,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_20,chinatst,incorrect,generic,0.08532523,,True
2093,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_5,chinatst,incorrect,generic,0.0966532699999999,,True
2094,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_8,chinatst,incorrect,generic,0.08278146,,True
2095,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,egmo_2025_1,egmo,incorrect,generic,0.07206849,,True
2096,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_3,chinatst,incorrect,generic,0.09763234,,True
2097,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_6,chinatst,incorrect,generic,0.0820202999999999,,True
2098,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_9,chinatst,incorrect,generic,0.07559775,,True
2099,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,egmo_2025_6,egmo,incorrect,generic,0.1264740899999999,,True
2100,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_A_2025_1,elmosl,correct,generic,0.0851409899999999,,False
2101,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,egmo_2025_5,egmo,incorrect,generic,0.07655259,,True
2102,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_A_2025_2,elmosl,correct,generic,0.03794257,,False
2103,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,egmo_2025_2,egmo,incorrect,generic,0.04039062,,True
2104,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,chinatst_2025_4,chinatst,incorrect,generic,0.11148179,,True
2105,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_A_2025_3,elmosl,correct,generic,0.0408225199999999,,False
2106,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_A_2025_6,elmosl,correct,generic,0.04067977,,False
2107,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_C_2025_3,elmosl,correct,generic,0.03480557,,False
2108,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,egmo_2025_4,egmo,incorrect,generic,0.05542885,,True
2109,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_A_2025_7,elmosl,incorrect,generic,0.0666356799999999,,True
2110,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_A_2025_8,elmosl,incorrect,generic,0.0863339299999999,,True
2111,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_C_2025_2,elmosl,incorrect,generic,0.06232671,,True
2112,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_C_2025_1,elmosl,incorrect,generic,0.01345287,,True
2113,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,egmo_2025_3,egmo,incorrect,generic,0.0627812599999999,,True
2114,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_C_2025_4,elmosl,incorrect,generic,0.0610476799999999,,True
2115,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_C_2025_7,elmosl,incorrect,generic,0.0404059099999999,,True
2116,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_C_2025_5,elmosl,incorrect,generic,0.04882195,,True
2117,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_G_2025_2,elmosl,incorrect,generic,0.07671171,,True
2118,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_C_2025_8,elmosl,incorrect,generic,0.09530376,,True
2119,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_A_2025_5,elmosl,incorrect,generic,0.04658603,,True
2120,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_G_2025_6,elmosl,correct,generic,0.10719648,,False
2121,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_G_2025_3,elmosl,incorrect,generic,0.0708983599999999,,True
2122,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_G_2025_1,elmosl,incorrect,generic,0.04050331,,True
2123,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_G_2025_5,elmosl,incorrect,generic,0.0554205199999999,,True
2124,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_G_2025_4,elmosl,incorrect,generic,0.07502817,,True
2125,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_G_2025_8,elmosl,incorrect,generic,0.04518721,,True
2126,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_G_2025_9,elmosl,incorrect,generic,0.02973083,,True
2127,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_C_2025_9,elmosl,incorrect,generic,0.04842621,,True
2128,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_G_2025_7,elmosl,incorrect,generic,0.00857376,,True
2129,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_NT_2025_2,elmosl,incorrect,generic,0.0691686599999999,,True
2130,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_NT_2025_1,elmosl,incorrect,generic,0.09029449,,True
2131,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_C_2025_6,elmosl,incorrect,generic,0.08452514,,True
2132,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_NT_2025_3,elmosl,incorrect,generic,0.03424534,,True
2133,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,german_2025_2,german,incorrect,generic,0.0597436099999999,,True
2134,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_NT_2025_7,elmosl,incorrect,generic,0.06351786,,True
2135,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_NT_2025_5,elmosl,incorrect,generic,0.03379319,,True
2136,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,german_2025_3,german,detected,generic,0.04078737,,False
2137,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,german_2025_4,german,incorrect,generic,0.07136795,,True
2138,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,german_2025_1,german,incorrect,generic,0.028328,,True
2139,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_1,imosl,correct,generic,0.0756998599999999,,False
2140,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,greece_2025_2,greece,incorrect,generic,0.09818753,,True
2141,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,elmosl_NT_2025_6,elmosl,incorrect,generic,0.03820552,,True
2142,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,greece_2025_1,greece,incorrect,generic,0.0453344299999999,,True
2143,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,greece_2025_3,greece,incorrect,generic,0.0534258799999999,,True
2144,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_11,imosl,incorrect,generic,0.0781007099999999,,True
2145,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_10,imosl,incorrect,generic,0.07554807,,True
2146,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,greece_2025_4,greece,detected,generic,0.0883552099999999,,False
2147,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_13,imosl,incorrect,generic,0.0751919799999999,,True
2148,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_17,imosl,incorrect,generic,0.04712818,,True
2149,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_14,imosl,incorrect,generic,0.07452565,,True
2150,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_15,imosl,incorrect,generic,0.03060243,,True
2151,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_18,imosl,incorrect,generic,0.0624461899999999,,True
2152,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_16,imosl,incorrect,generic,0.04536137,,True
2153,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_12,imosl,incorrect,generic,0.0758753599999999,,True
2154,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_19,imosl,incorrect,generic,0.06080639,,True
2155,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_2,imosl,incorrect,generic,0.07087433,,True
2156,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_20,imosl,correct,generic,0.0756583499999999,,False
2157,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_23,imosl,incorrect,generic,0.00703815,,True
2158,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_25,imosl,incorrect,generic,0.0819489899999999,,True
2159,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_24,imosl,incorrect,generic,0.04589642,,True
2160,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_21,imosl,correct,generic,0.06702933,,False
2161,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_22,imosl,incorrect,generic,0.0540053999999999,,True
2162,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_28,imosl,incorrect,generic,0.09029728,,True
2163,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_26,imosl,correct,generic,0.12540847,,False
2164,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_29,imosl,incorrect,generic,0.07285417,,True
2165,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_27,imosl,correct,generic,0.06121641,,False
2166,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_30,imosl,incorrect,generic,0.1245642999999999,,True
2167,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_3,imosl,correct,generic,0.04546207,,False
2168,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_31,imosl,incorrect,generic,0.06992496,,True
2169,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_33,imosl,incorrect,generic,0.0631612,,True
2170,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_34,imosl,incorrect,generic,0.0426293899999999,,True
2171,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_35,imosl,incorrect,generic,0.01560472,,True
2172,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_37,imosl,incorrect,generic,0.11734001,,True
2173,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_36,imosl,incorrect,generic,0.07945252,,True
2174,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_7,imosl,incorrect,generic,0.07180612,,True
2175,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_32,imosl,incorrect,generic,0.08573633,,True
2176,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_8,imosl,correct,generic,0.09522998,,False
2177,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_5,imosl,correct,generic,0.0641883,,False
2178,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_4,imosl,incorrect,generic,0.07762556,,True
2179,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_9,imosl,incorrect,generic,0.05645263,,True
2180,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_2025_2,india,incorrect,generic,0.02803572,,True
2181,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_2025_3,india,incorrect,generic,0.05127873,,True
2182,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_2025_4,india,correct,generic,0.05461634,,False
2183,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_1,india,incorrect,generic,0.0510514699999999,,True
2184,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_2025_6,india,incorrect,generic,0.07191014,,True
2185,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_2025_5,india,detected,generic,0.08052235,,False
2186,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_10,india,correct,generic,0.1367922199999999,,False
2187,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_11,india,incorrect,generic,0.06345467,,True
2188,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_12,india,correct,generic,0.04808126,,False
2189,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,imosl_2025_6,imosl,incorrect,generic,0.06484054,,True
2190,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_14,india,incorrect,generic,0.05835417,,True
2191,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_16,india,correct,generic,0.08064982,,False
2192,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_15,india,incorrect,generic,0.07055301,,True
2193,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_18,india,incorrect,generic,0.04021599,,True
2194,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_2025_1,india,incorrect,generic,0.06987014,,True
2195,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_17,india,incorrect,generic,0.08633753,,True
2196,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_13,india,incorrect,generic,0.0870501599999999,,True
2197,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_20,india,incorrect,generic,0.06244004,,True
2198,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_2,india,detected,generic,0.12469251,,False
2199,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_3,india,detected,generic,0.06341269,,False
2200,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_6,india,detected,generic,0.06480896,,False
2201,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_7,india,incorrect,generic,0.08332023,,True
2202,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_21,india,incorrect,generic,0.0946288399999999,,True
2203,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,iran_tst_2025_1,iran,incorrect,generic,0.06095851,,True
2204,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_8,india,incorrect,generic,0.06395337,,True
2205,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,iran_tst_2025_2,iran,incorrect,generic,0.07059914,,True
2206,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_5,india,incorrect,generic,0.06737911,,True
2207,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_19,india,incorrect,generic,0.0984828899999999,,True
2208,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,india_prep_2025_9,india,incorrect,generic,0.06779785,,True
2209,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,iran_tst_2025_6,iran,detected,generic,0.0616051299999999,,False
2210,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,iran_tst_2025_4,iran,incorrect,generic,0.0798150199999999,,True
2211,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,iran_tst_2025_3,iran,incorrect,generic,0.0612295199999999,,True
2212,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,iran_tst_2025_5,iran,detected,generic,0.09039656,,False
2213,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,iran_tst_2025_7,iran,incorrect,generic,0.0520158299999999,,True
2214,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,iran_tst_2025_9,iran,incorrect,generic,0.0667188,,True
2215,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,iran_tst_2025_8,iran,incorrect,generic,0.04834129,,True
2216,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_1,israel,incorrect,generic,0.03233184,,True
2217,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_10,israel,incorrect,generic,0.07177437,,True
2218,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_7,israel,incorrect,generic,0.04231766,,True
2219,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_6,israel,incorrect,generic,0.05245162,,True
2220,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_5,israel,incorrect,generic,0.0741571999999999,,True
2221,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_2,israel,incorrect,generic,0.06434324,,True
2222,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_9,israel,incorrect,generic,0.09563226,,True
2223,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_3,israel,correct,generic,0.0356397999999999,,False
2224,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,izho_2025_3,izho,incorrect,generic,0.0757568399999999,,True
2225,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,izho_2025_2,izho,incorrect,generic,0.0650276,,True
2226,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_4,israel,incorrect,generic,0.0621903699999999,,True
2227,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,izho_2025_5,izho,incorrect,generic,0.08836435,,True
2228,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,jbmo_2025_1,jbmo,detected,generic,0.04071136,,False
2229,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,israel_tst_2025_8,israel,detected,generic,0.07327229,,False
2230,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,izho_2025_4,izho,incorrect,generic,0.07120117,,True
2231,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,jbmo_2025_4,jbmo,incorrect,generic,0.0785401699999999,,True
2232,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,izho_2025_6,izho,incorrect,generic,0.05482068,,True
2233,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,jbmo_2025_3,jbmo,incorrect,generic,0.0542253199999999,,True
2234,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,izho_2025_1,izho,incorrect,generic,0.01360782,,True
2235,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,korea_2025_1,korea,correct,generic,0.0713401999999999,,False
2236,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,korea_2025_2,korea,correct,generic,0.05811,,False
2237,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,korea_2025_6,korea,incorrect,generic,0.07603292,,True
2238,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_1,matharena,incorrect,matharena,0.03963046,,True
2239,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,korea_2025_5,korea,incorrect,generic,0.0780811199999999,,True
2240,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,korea_2025_3,korea,incorrect,generic,0.06238866,,True
2241,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_11,matharena,incorrect,matharena,0.11884652,,True
2242,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_10,matharena,incorrect,matharena,0.08764958,,True
2243,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_16,matharena,correct,matharena,0.0279738,,False
2244,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_17,matharena,incorrect,matharena,0.0202621199999999,,True
2245,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_19,matharena,correct,matharena,0.05459783,,False
2246,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_12,matharena,correct,matharena,0.1203557,,False
2247,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_13,matharena,incorrect,matharena,0.09974553,,True
2248,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_2,matharena,correct,matharena,0.09439773,,False
2249,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_18,matharena,incorrect,matharena,0.0769991,,True
2250,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_20,matharena,correct,matharena,0.0853076899999999,,False
2251,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_22,matharena,incorrect,matharena,0.08591291,,True
2252,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_21,matharena,correct,matharena,0.09937567,,False
2253,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_15,matharena,incorrect,matharena,0.0783442999999999,,True
2254,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_23,matharena,correct,matharena,0.11782118,,False
2255,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_25,matharena,correct,matharena,0.0568110999999999,,False
2256,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_27,matharena,detected,matharena,0.06701308,,False
2257,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_24,matharena,correct,matharena,0.05015903,,False
2258,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_29,matharena,correct,matharena,0.0691184,,False
2259,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_26,matharena,incorrect,matharena,0.07395439,,True
2260,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_5,matharena,correct,matharena,0.05059755,,False
2261,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_28,matharena,corrected,matharena,0.06490386,,False
2262,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_3,matharena,incorrect,matharena,0.0473652,,True
2263,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_4,matharena,incorrect,matharena,0.03315251,,True
2264,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_6,matharena,correct,matharena,0.02837232,,False
2265,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_7,matharena,incorrect,matharena,0.09301963,,True
2266,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_30,matharena,incorrect,matharena,0.0835351499999999,,True
2267,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_11,matharena,incorrect,matharena,0.0735465799999999,,True
2268,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_1,matharena,correct,matharena,0.0323011999999999,,False
2269,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_8,matharena,correct,matharena,0.04444369,,False
2270,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_12,matharena,incorrect,matharena,0.1016301199999999,,True
2271,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_13,matharena,correct,matharena,0.1146646899999999,,False
2272,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_15,matharena,correct,matharena,0.0558722199999999,,False
2273,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_10,matharena,incorrect,matharena,0.0586386799999999,,True
2274,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_17,matharena,incorrect,matharena,0.09899432,,True
2275,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_14,matharena,correct,matharena,0.05116431,,False
2276,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_16,matharena,correct,matharena,0.0261926299999999,,False
2277,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_aime_aime_2025_9,matharena,incorrect,matharena,0.0962518899999999,,True
2278,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_18,matharena,correct,matharena,0.0297153699999999,,False
2279,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_19,matharena,incorrect,matharena,0.0551707399999999,,True
2280,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_21,matharena,incorrect,matharena,0.0922033799999999,,True
2281,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_22,matharena,incorrect,matharena,0.1015637299999999,,True
2282,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_23,matharena,incorrect,matharena,0.02161614,,True
2283,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_2,matharena,incorrect,matharena,0.0185556,,True
2284,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_24,matharena,incorrect,matharena,0.08801867,,True
2285,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_26,matharena,incorrect,matharena,0.03214458,,True
2286,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_25,matharena,incorrect,matharena,0.07796978,,True
2287,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_29,matharena,correct,matharena,0.04642779,,False
2288,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_27,matharena,incorrect,matharena,0.0473442599999999,,True
2289,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_30,matharena,incorrect,matharena,0.07848632,,True
2290,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_3,matharena,corrected,matharena,0.08732559,,False
2291,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_5,matharena,incorrect,matharena,0.0536552599999999,,True
2292,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_28,matharena,incorrect,matharena,0.09724423,,True
2293,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_4,matharena,incorrect,matharena,0.02229608,,True
2294,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_7,matharena,correct,matharena,0.01943002,,False
2295,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_8,matharena,correct,matharena,0.06126781,,False
2296,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_10,matharena,correct,matharena,0.06080247,,False
2297,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_14,matharena,incorrect,matharena,0.0874882199999999,,True
2298,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_9,matharena,correct,matharena,0.03097448,,False
2299,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_12,matharena,detected,matharena,0.03272992,,False
2300,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_brumo_brumo_2025_6,matharena,incorrect,matharena,0.07518699,,True
2301,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_1,matharena,incorrect,matharena,0.06527337,,True
2302,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_16,matharena,incorrect,matharena,0.12726668,,True
2303,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_17,matharena,correct,matharena,0.03584796,,False
2304,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_2,matharena,incorrect,matharena,0.04592296,,True
2305,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_13,matharena,incorrect,matharena,0.04667531,,True
2306,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_15,matharena,incorrect,matharena,0.07462519,,True
2307,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_11,matharena,incorrect,matharena,0.05555362,,True
2308,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_19,matharena,correct,matharena,0.10390216,,False
2309,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_18,matharena,incorrect,matharena,0.05522986,,True
2310,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_23,matharena,correct,matharena,0.08006853,,False
2311,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_28,matharena,incorrect,matharena,0.12168233,,True
2312,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_22,matharena,incorrect,matharena,0.12455762,,True
2313,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_20,matharena,incorrect,matharena,0.06148794,,True
2314,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_25,matharena,incorrect,matharena,0.13285023,,True
2315,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_24,matharena,detected,matharena,0.0628895,,False
2316,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_26,matharena,correct,matharena,0.0542418099999999,,False
2317,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_21,matharena,incorrect,matharena,0.03957963,,True
2318,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_3,matharena,correct,matharena,0.0650722599999999,,False
2319,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_31,matharena,incorrect,matharena,0.04168647,,True
2320,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_32,matharena,incorrect,matharena,0.0741650599999999,,True
2321,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_30,matharena,incorrect,matharena,0.02510498,,True
2322,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_29,matharena,incorrect,matharena,0.1113909,,True
2323,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_33,matharena,incorrect,matharena,0.06645537,,True
2324,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_36,matharena,incorrect,matharena,0.0800977,,True
2325,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_38,matharena,incorrect,matharena,0.13686556,,True
2326,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_4,matharena,correct,matharena,0.0426910899999999,,False
2327,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_37,matharena,incorrect,matharena,0.06726301,,True
2328,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_35,matharena,incorrect,matharena,0.06319547,,True
2329,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_5,matharena,incorrect,matharena,0.10991195,,True
2330,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_9,matharena,correct,matharena,0.10222118,,False
2331,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_40,matharena,incorrect,matharena,0.07727084,,True
2332,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_12,matharena,incorrect,matharena,0.06335049,,True
2333,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_7,matharena,incorrect,matharena,0.0924107699999999,,True
2334,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_8,matharena,detected,matharena,0.0791596,,False
2335,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_11,matharena,incorrect,matharena,0.08650805,,True
2336,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_13,matharena,corrected,matharena,0.07259562,,False
2337,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_10,matharena,incorrect,matharena,0.06658511,,True
2338,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_1,matharena,incorrect,matharena,0.0662958999999999,,True
2339,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_14,matharena,incorrect,matharena,0.0624565299999999,,True
2340,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_16,matharena,incorrect,matharena,0.13074496,,True
2341,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_2,matharena,incorrect,matharena,0.0812756499999999,,True
2342,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_17,matharena,incorrect,matharena,0.064814,,True
2343,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_18,matharena,incorrect,matharena,0.11867718,,True
2344,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_cmimc_cmimc_2025_6,matharena,incorrect,matharena,0.1167689299999999,,True
2345,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_19,matharena,incorrect,matharena,0.10984352,,True
2346,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_22,matharena,incorrect,matharena,0.0727028699999999,,True
2347,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_21,matharena,correct,matharena,0.04512201,,False
2348,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_20,matharena,incorrect,matharena,0.08035536,,True
2349,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_27,matharena,incorrect,matharena,0.06535049,,True
2350,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_23,matharena,correct,matharena,0.0392889499999999,,False
2351,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_24,matharena,corrected,matharena,0.0939471799999999,,False
2352,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_28,matharena,incorrect,matharena,0.04565533,,True
2353,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_29,matharena,incorrect,matharena,0.08426661,,True
2354,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_30,matharena,incorrect,matharena,0.08579518,,True
2355,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_4,matharena,correct,matharena,0.0328393299999999,,False
2356,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_6,matharena,incorrect,matharena,0.11825754,,True
2357,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_5,matharena,incorrect,matharena,0.06486267,,True
2358,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_3,matharena,incorrect,matharena,0.03264839,,True
2359,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_1,matharena,incorrect,matharena,0.035461,,True
2360,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_9,matharena,incorrect,matharena,0.09742601,,True
2361,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_12,matharena,incorrect,matharena,0.04347511,,True
2362,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_7,matharena,detected,matharena,0.0424107699999999,,False
2363,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_10,matharena,incorrect,matharena,0.0838252,,True
2364,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_8,matharena,incorrect,matharena,0.0225316599999999,,True
2365,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_13,matharena,incorrect,matharena,0.0931946499999999,,True
2366,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_14,matharena,incorrect,matharena,0.06425868,,True
2367,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_hmmt_hmmt_feb_2025_26,matharena,incorrect,matharena,0.1098206599999999,,True
2368,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_16,matharena,correct,matharena,0.0424752999999999,,False
2369,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_18,matharena,incorrect,matharena,0.1218588799999999,,True
2370,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_21,matharena,incorrect,matharena,0.05863212,,True
2371,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_2,matharena,correct,matharena,0.02194684,,False
2372,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_17,matharena,correct,matharena,0.07099847,,False
2373,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_15,matharena,incorrect,matharena,0.0332145499999999,,True
2374,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_19,matharena,incorrect,matharena,0.0942795,,True
2375,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_23,matharena,incorrect,matharena,0.05025709,,True
2376,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_22,matharena,incorrect,matharena,0.06050361,,True
2377,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_28,matharena,correct,matharena,0.02180613,,False
2378,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_25,matharena,incorrect,matharena,0.0333086799999999,,True
2379,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_26,matharena,incorrect,matharena,0.07661967,,True
2380,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_29,matharena,corrected,matharena,0.05254231,,False
2381,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_30,matharena,incorrect,matharena,0.08150754,,True
2382,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_27,matharena,incorrect,matharena,0.05352997,,True
2383,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_3,matharena,correct,matharena,0.04671099,,False
2384,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_34,matharena,incorrect,matharena,0.09717632,,True
2385,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_31,matharena,incorrect,matharena,0.03352388,,True
2386,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_32,matharena,corrected,matharena,0.04846661,,False
2387,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_33,matharena,incorrect,matharena,0.0596991499999999,,True
2388,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_20,matharena,incorrect,matharena,0.09204791,,True
2389,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_35,matharena,correct,matharena,0.05216246,,False
2390,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_4,matharena,correct,matharena,0.01818606,,False
2391,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_36,matharena,incorrect,matharena,0.0926416799999999,,True
2392,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_40,matharena,incorrect,matharena,0.10213612,,True
2393,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_43,matharena,incorrect,matharena,0.0954067899999999,,True
2394,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_38,matharena,incorrect,matharena,0.1021151,,True
2395,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_39,matharena,correct,matharena,0.04674703,,False
2396,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_41,matharena,incorrect,matharena,0.09124533,,True
2397,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_44,matharena,incorrect,matharena,0.02039363,,True
2398,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_45,matharena,correct,matharena,0.01354337,,False
2399,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_42,matharena,incorrect,matharena,0.06690822,,True
2400,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_46,matharena,correct,matharena,0.02838596,,False
2401,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_48,matharena,incorrect,matharena,0.0295434399999999,,True
2402,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_47,matharena,incorrect,matharena,0.0493010199999999,,True
2403,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_52,matharena,incorrect,matharena,0.08208323,,True
2404,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_51,matharena,correct,matharena,0.07761409,,False
2405,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_49,matharena,correct,matharena,0.08251673,,False
2406,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_50,matharena,incorrect,matharena,0.03238308,,True
2407,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_5,matharena,incorrect,matharena,0.07440014,,True
2408,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_53,matharena,incorrect,matharena,0.08334912,,True
2409,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,nordic_2025_1,nordic,incorrect,generic,0.06612027,,True
2410,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_6,matharena,incorrect,matharena,0.09701854,,True
2411,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,nordic_2025_3,nordic,incorrect,generic,0.0725030199999999,,True
2412,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,pan_african_2025_2,pan,correct,generic,0.03630684,,False
2413,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,nordic_2025_2,nordic,incorrect,generic,0.07827251,,True
2414,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_9,matharena,incorrect,matharena,0.05925693,,True
2415,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,pan_african_2025_5,pan,incorrect,generic,0.10270204,,True
2416,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,philippines_2025_1,philippines,incorrect,generic,0.09916733,,True
2417,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,pan_african_2025_1,pan,incorrect,generic,0.0445948699999999,,True
2418,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,pan_african_2025_3,pan,incorrect,generic,0.0153916599999999,,True
2419,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,pan_african_2025_4,pan,correct,generic,0.0390124,,False
2420,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,philippines_2025_5,philippines,correct,generic,0.0549168499999999,,False
2421,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,matharena_smt_smt_2025_7,matharena,incorrect,matharena,0.06446074,,True
2422,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,philippines_2025_4,philippines,incorrect,generic,0.0074251799999999,,True
2423,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,philippines_2025_2,philippines,incorrect,generic,0.0750042,,True
2424,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,pan_african_2025_6,pan,incorrect,generic,0.13617417,,True
2425,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,philippines_2025_3,philippines,incorrect,generic,0.051691,,True
2426,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,polish_2025_1,polish,detected,generic,0.0179150699999999,,False
2427,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,philippines_2025_8,philippines,incorrect,generic,0.10223887,,True
2428,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,philippines_2025_6,philippines,incorrect,generic,0.1319512499999999,,True
2429,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,polish_2025_3,polish,incorrect,generic,0.01582213,,True
2430,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,philippines_2025_7,philippines,incorrect,generic,0.06637294,,True
2431,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,polish_2025_2,polish,incorrect,generic,0.0441731599999999,,True
2432,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,polish_2025_4,polish,incorrect,generic,0.06971245,,True
2433,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,polish_2025_6,polish,detected,generic,0.05657255,,False
2434,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,rmm_2025_1,rmm,incorrect,generic,0.0609073,,True
2435,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,polish_2025_5,polish,detected,generic,0.05585278,,False
2436,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,rmm_2025_5,rmm,incorrect,generic,0.05974689,,True
2437,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,rmm_2025_2,rmm,correct,generic,0.1116416,,False
2438,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,rmm_2025_3,rmm,incorrect,generic,0.05388817,,True
2439,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,rmm_2025_6,rmm,incorrect,generic,0.0757510399999999,,True
2440,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,rmm_2025_4,rmm,incorrect,generic,0.11848213,,True
2441,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_10_2025_1,romania,incorrect,generic,0.04472629,,True
2442,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_11_2025_2,romania,incorrect,generic,0.04293543,,True
2443,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_11_2025_1,romania,correct,generic,0.07171852,,False
2444,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_10_2025_2,romania,incorrect,generic,0.07319139,,True
2445,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_12_2025_2,romania,incorrect,generic,0.0434045199999999,,True
2446,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_9_2025_1,romania,incorrect,generic,0.0739988799999999,,True
2447,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_12_2025_1,romania,incorrect,generic,0.06558227,,True
2448,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_10_2025_3,romania,incorrect,generic,0.05778295,,True
2449,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_12_2025_3,romania,incorrect,generic,0.07714762,,True
2450,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_tst_2025_3,romania,corrected,generic,0.08273369,,False
2451,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_11_2025_3,romania,correct,generic,0.06177151,,False
2452,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_9_2025_2,romania,incorrect,generic,0.1203184799999999,,True
2453,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_tst_2025_1,romania,incorrect,generic,0.08271618,,True
2454,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,serbia_tst_bmo_2025_2,serbia,incorrect,generic,0.07659117,,True
2455,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,serbia_tst_bmo_2025_1,serbia,detected,generic,0.0836588799999999,,False
2456,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,romania_tst_2025_2,romania,incorrect,generic,0.0772070899999999,,True
2457,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,spain_2025_1,spain,incorrect,generic,0.04389621,,True
2458,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,serbia_tst_bmo_2025_4,serbia,incorrect,generic,0.08563053,,True
2459,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,spain_2025_2,spain,detected,generic,0.06533024,,False
2460,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,spain_2025_3,spain,incorrect,generic,0.08294722,,True
2461,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_1,thai,correct,generic,0.08578819,,False
2462,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_10,thai,detected,generic,0.07610998,,False
2463,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,spain_2025_4,spain,detected,generic,0.0511308,,False
2464,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,serbia_tst_bmo_2025_3,serbia,correct,generic,0.05478827,,False
2465,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_2,thai,correct,generic,0.06235288,,False
2466,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,spain_2025_5,spain,incorrect,generic,0.08307864,,True
2467,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_3,thai,incorrect,generic,0.04030731,,True
2468,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_4,thai,incorrect,generic,0.1097828699999999,,True
2469,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_6,thai,incorrect,generic,0.0641351,,True
2470,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_5,thai,incorrect,generic,0.0692621,,True
2471,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_8,thai,incorrect,generic,0.0627511,,True
2472,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,turkey_tst_2025_2,turkey,incorrect,generic,0.0489394399999999,,True
2473,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_9,thai,correct,generic,0.05123764,,False
2474,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,thai_2025_7,thai,correct,generic,0.037825,,False
2475,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,turkey_tst_2025_6,turkey,incorrect,generic,0.00699172,,True
2476,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,turkey_tst_2025_7,turkey,incorrect,generic,0.0642959199999999,,True
2477,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,turkey_tst_2025_3,turkey,incorrect,generic,0.0660179499999999,,True
2478,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,turkey_tst_2025_1,turkey,detected,generic,0.0611747699999999,,False
2479,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,turkey_tst_2025_9,turkey,detected,generic,0.0313890999999999,,False
2480,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,turkey_tst_2025_4,turkey,incorrect,generic,0.06847078,,True
2481,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,turkey_tst_2025_8,turkey,incorrect,generic,0.10133896,,True
2482,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,turkey_tst_2025_5,turkey,incorrect,generic,0.09006797,,True
2483,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usamo_2025_2,usamo,incorrect,generic,0.05349231,,True
2484,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usamo_2025_1,usamo,incorrect,generic,0.05561535,,True
2485,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usamo_2025_4,usamo,correct,generic,0.04678972,,False
2486,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usamo_2025_5,usamo,incorrect,generic,0.06964505,,True
2487,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usatst_2025_1,usatst,incorrect,generic,0.0479818,,True
2488,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usamo_2025_3,usamo,incorrect,generic,0.04674023,,True
2489,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usatst_2025_2,usatst,incorrect,generic,0.10346537,,True
2490,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usatst_2025_5,usatst,incorrect,generic,0.07873534,,True
2491,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usamo_2025_6,usamo,detected,generic,0.0560486599999999,,False
2492,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usatst_2025_4,usatst,incorrect,generic,0.12946343,,True
2493,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usatst_2025_7,usatst,incorrect,generic,0.02783861,,True
2494,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usatst_2025_6,usatst,incorrect,generic,0.08168641,,True
2495,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usatst_2025_9,usatst,incorrect,generic,0.0879535999999999,,True
2496,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,vietnam_2025_4,vietnam,incorrect,generic,0.06458885,,True
2497,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,vietnam_2025_3,vietnam,incorrect,generic,0.057744,,True
2498,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,usatst_2025_8,usatst,incorrect,generic,0.0690203599999999,,True
2499,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,vietnam_2025_5,vietnam,incorrect,generic,0.09231748,,True
2500,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,vietnam_2025_1,vietnam,correct,generic,0.05843241,,False
2501,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,vietnam_2025_2,vietnam,incorrect,generic,0.11874341,,True
2502,GPT-5-mini (medium),DeepSeek-v3.1 (Think),deepseek/deepseek_v31,vietnam_2025_6,vietnam,detected,generic,0.0821127599999999,,False
