,judge,solver,solver_id,problem,competition,true_grade,split,cost,confidence,entropy,tail_confidence,full_confidence,incorrect
0,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_22,matharena,incorrect,matharena,0.0,,0.3575565,15.951963,13.284146,True
1,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_9_2025_1-part1,romania,detected,generic,0.0,,0.2196096,19.376427,16.332504,False
2,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_5,elmosl,incorrect,generic,0.0,,0.27413073,16.410686,14.945548,True
3,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_1-part1,imosl,correct,generic,0.0,,0.23532459,13.538811,15.558486,False
4,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_48-part1,matharena,correct,matharena,0.0,,0.20579734,19.815918,16.815567,False
5,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_14-part1,matharena,correct,matharena,0.0,,0.20038103,16.581299,16.372782,False
6,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_14-part1,imosl,incorrect,generic,0.0,,0.30480087,15.434358,14.039884,True
7,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_9-part1,turkey,incorrect,generic,0.0,,0.29091224,15.08946,14.242882,True
8,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,nordic_2025_1-part1,nordic,correct,generic,0.0,,0.28429508,16.989038,14.782328,False
9,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_51-part1,matharena,incorrect,matharena,0.0,,0.17981136,16.815725,16.346205,True
10,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_1-part1,pan,corrected,generic,0.0,,0.239514,16.844442,15.584284,False
11,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_20,matharena,incorrect,matharena,0.0,,0.24918361,14.369947,15.4696865,True
12,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_4-part1,bulgaria,correct,generic,0.0,,0.22096571,14.777244,16.216404,False
13,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_28-part1,matharena,correct,matharena,0.0,,0.24831945,17.403408,15.729024,False
14,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_7-part1,matharena,detected,matharena,0.0,,0.3224565,13.939843,13.700962,False
15,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_9,chinatst,detected,generic,0.0,,0.27131355,16.518787,14.874209,False
16,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_2,elmosl,incorrect,generic,0.0,,0.26201236,15.927994,15.226321,True
17,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_4-part1,chinatst,incorrect,generic,0.0,,0.20116621,14.928825,16.490181,True
18,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_28,matharena,correct,matharena,0.0,,0.26169804,19.02117,15.828775,False
19,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_3-part1,iran,incorrect,generic,0.0,,0.33460557,16.202217,13.422067,True
20,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_18-part1,india,incorrect,generic,0.0,,0.24192315,14.130251,15.589585,True
21,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_18-part1,imosl,incorrect,generic,0.0,,0.23780303,14.076144,15.9312105,True
22,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_12,matharena,correct,matharena,0.0,,0.30397126,14.474425,14.555869,False
23,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_1-part1,matharena,correct,matharena,0.0,,0.22476801,20.038605,16.459171,False
24,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_32-part1,matharena,incorrect,matharena,0.0,,0.32856297,14.104892,13.401378,True
25,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_3-part1,usamo,incorrect,generic,0.0,,0.33958244,15.450779,13.510349,True
26,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_19,matharena,incorrect,matharena,0.0,,0.23469411,15.289456,15.415831,True
27,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_19-part1,india,incorrect,generic,0.0,,0.28231582,18.030935,15.321267,True
28,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_5-part1,bmosl,incorrect,generic,0.0,,0.2898331,13.7169075,14.078143,True
29,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_15-part1,matharena,incorrect,matharena,0.0,,0.22232518,16.104136,16.14442,True
30,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_5-part1,chinatst,incorrect,generic,0.0,,0.33946094,17.107897,13.763782,True
31,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_2,matharena,correct,matharena,0.0,,0.16859761,21.21933,18.373413,False
32,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_24-part1,matharena,detected,matharena,0.0,,0.27367696,16.100325,14.756872,False
33,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_50,matharena,correct,matharena,0.0,,0.17985256,19.940382,17.68428,False
34,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_2-part1,egmo,incorrect,generic,0.0,,0.26817504,16.97047,14.756033,True
35,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_36,matharena,correct,matharena,0.0,,0.23333287,16.909075,15.874099,False
36,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_4,izho,incorrect,generic,0.0,,0.20152403,18.673016,16.682482,True
37,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_6,rmm,detected,generic,0.0,,0.3386696,15.405348,13.444728,False
38,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_35,matharena,detected,matharena,0.0,,0.20744778,18.093126,16.542536,False
39,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmo_2025_4-part1,bmo,incorrect,generic,0.0,,0.32709235,15.5571,13.683332,True
40,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_2-part1,elmosl,incorrect,generic,0.0,,0.2357443,14.727221,15.082333,True
41,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_31,matharena,correct,matharena,0.0,,0.27415034,17.953249,15.078192,False
42,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_9,iran,incorrect,generic,0.0,,0.17638081,17.465479,16.935135,True
43,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,serbia_tst_bmo_2025_1,serbia,incorrect,generic,0.0,,0.25784063,15.080807,15.154226,True
44,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_4,spain,incorrect,generic,0.0,,0.23325127,16.317303,15.829055,True
45,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_19,imosl,incorrect,generic,0.0,,0.15709351,15.466444,18.11374,True
46,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_5,bulgaria,correct,generic,0.0,,0.24526647,17.958494,15.597761,False
47,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_9-part1,matharena,detected,matharena,0.0,,0.21499479,15.815298,15.983826,False
48,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_3-part1,rmm,incorrect,generic,0.0,,0.26613513,17.790358,15.086514,True
49,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_18-part1,matharena,correct,matharena,0.0,,0.24809094,13.426575,15.746846,False
50,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_3,usatst,incorrect,generic,0.0,,0.26115867,15.576478,14.518297,True
51,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_18,matharena,correct,matharena,0.0,,0.31262583,16.97947,14.491249,False
52,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_28-part1,matharena,correct,matharena,0.0,,0.17594534,17.017488,16.854153,False
53,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_6-part1,bmosl,incorrect,generic,0.0,,0.31628442,14.721242,13.894847,True
54,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_4,matharena,correct,matharena,0.0,,0.21259768,19.700474,16.44585,False
55,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_7,elmosl,incorrect,generic,0.0,,0.14762804,19.30797,18.87614,True
56,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_28,imosl,correct,generic,0.0,,0.25261986,14.244091,14.829469,False
57,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_4-part1,elmosl,incorrect,generic,0.0,,0.33429083,15.987315,13.758063,True
58,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_10,imosl,incorrect,generic,0.0,,0.2740639,16.811823,15.003462,True
59,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_11,matharena,incorrect,matharena,0.0,,0.32619324,15.766291,14.278363,True
60,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_9,matharena,detected,matharena,0.0,,0.17885287,17.992662,17.27296,False
61,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_7-part1,imosl,detected,generic,0.0,,0.24491352,13.241534,14.797176,False
62,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_3-part1,elmosl,correct,generic,0.0,,0.26981074,16.892302,15.110625,False
63,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_11,matharena,incorrect,matharena,0.0,,0.2808038,14.053869,15.319664,True
64,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_6-part1,polish,detected,generic,0.0,,0.18489891,17.684988,17.210604,False
65,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_7,elmosl,incorrect,generic,0.0,,0.23936766,17.841757,15.327346,True
66,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_2-part1,elmosl,incorrect,generic,0.0,,0.2082969,16.3,16.032906,True
67,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_47-part1,matharena,correct,matharena,0.0,,0.17038873,17.792274,17.787834,False
68,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_27-part1,matharena,incorrect,matharena,0.0,,0.17410423,16.012106,17.792841,True
69,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,greece_2025_4-part1,greece,correct,generic,0.0,,0.1929457,18.555033,16.96302,False
70,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_22,imosl,incorrect,generic,0.0,,0.29522064,15.239638,14.251994,True
71,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_3,spain,incorrect,generic,0.0,,0.20980513,18.76498,16.16306,True
72,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_9_2025_2-part1,romania,incorrect,generic,0.0,,0.20891923,15.509047,16.071827,True
73,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_34-part1,matharena,incorrect,matharena,0.0,,0.27065527,14.830062,14.475035,True
74,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_4-part1,matharena,incorrect,matharena,0.0,,0.21424885,19.81959,17.255692,True
75,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_13-part1,matharena,incorrect,matharena,0.0,,0.17833917,18.864466,17.795986,True
76,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_7,elmosl,incorrect,generic,0.0,,0.31916812,16.227364,14.351885,True
77,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_9-part1,india,detected,generic,0.0,,0.21380647,16.064974,16.507473,False
78,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_1,elmosl,detected,generic,0.0,,0.23635717,15.597786,15.773426,False
79,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_37,matharena,incorrect,matharena,0.0,,0.20302302,15.911304,15.965657,True
80,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_4,bmosl,detected,generic,0.0,,0.20162863,13.908705,16.250553,False
81,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_14,matharena,correct,matharena,0.0,,0.22150397,18.40101,16.086267,False
82,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,greece_2025_3-part1,greece,detected,generic,0.0,,0.16139819,21.734774,18.135363,False
83,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_28,matharena,incorrect,matharena,0.0,,0.21907638,16.500652,16.048922,True
84,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_7-part1,matharena,incorrect,matharena,0.0,,0.18064572,14.394353,16.475645,True
85,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_33-part1,matharena,incorrect,matharena,0.0,,0.22812541,17.974522,15.36318,True
86,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_4-part1,thai,detected,generic,0.0,,0.19418377,15.721751,17.152424,False
87,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_5-part1,elmosl,correct,generic,0.0,,0.22180021,16.303032,16.19712,False
88,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_1,elmosl,incorrect,generic,0.0,,0.26410156,16.833115,14.831258,True
89,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_1,izho,incorrect,generic,0.0,,0.186839,23.033598,18.128912,True
90,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_8,allrussian,incorrect,generic,0.0,,0.2317466,15.580049,15.927168,True
91,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_3-part1,matharena,incorrect,matharena,0.0,,0.20224902,18.694515,16.59902,True
92,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_4-part1,israel,incorrect,generic,0.0,,0.1963457,19.01289,16.78573,True
93,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_4,matharena,correct,matharena,0.0,,0.22584444,18.882036,16.866074,False
94,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_3-part1,usatst,incorrect,generic,0.0,,0.2319819,14.898022,15.155028,True
95,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_5-part1,thai,incorrect,generic,0.0,,0.25611258,15.975819,15.257828,True
96,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_8-part1,philippines,correct,generic,0.0,,0.23903409,15.781392,15.538718,False
97,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_5,rmm,incorrect,generic,0.0,,0.228337,15.845383,15.97839,True
98,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_41,matharena,incorrect,matharena,0.0,,0.23988543,15.861253,15.528216,True
99,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_27,imosl,correct,generic,0.0,,0.28446722,14.7574005,14.454198,False
100,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,greece_2025_2,greece,incorrect,generic,0.0,,0.21288162,13.828619,16.558165,True
101,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_24,matharena,detected,matharena,0.0,,0.24475108,15.838583,15.46098,False
102,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_8,thai,incorrect,generic,0.0,,0.32447407,14.355332,13.938337,True
103,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_6-part1,imosl,detected,generic,0.0,,0.20703225,16.404749,16.396955,False
104,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_9,thai,incorrect,generic,0.0,,0.2774963,14.709468,14.23434,True
105,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_17,matharena,incorrect,matharena,0.0,,0.21294667,17.466011,15.927559,True
106,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_14,matharena,incorrect,matharena,0.0,,0.2924729,14.326419,15.172117,True
107,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_1-part1,polish,detected,generic,0.0,,0.22775853,18.652775,16.31234,False
108,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_1-part1,thai,detected,generic,0.0,,0.21716483,17.766905,16.724333,False
109,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_27,matharena,incorrect,matharena,0.0,,0.18461414,18.219896,17.408907,True
110,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_21-part1,matharena,detected,matharena,0.0,,0.2446059,16.432636,15.70442,False
111,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_6-part1,india,incorrect,generic,0.0,,0.22596294,15.6350565,15.944543,True
112,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_5-part1,polish,detected,generic,0.0,,0.26280907,15.67835,14.791644,False
113,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_5-part1,bmosl,detected,generic,0.0,,0.23995025,14.844692,15.84263,False
114,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_6-part1,elmosl,detected,generic,0.0,,0.23655756,15.278503,15.525897,False
115,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_39-part1,matharena,incorrect,matharena,0.0,,0.16488074,18.790237,17.685814,True
116,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_2-part1,izho,incorrect,generic,0.0,,0.31486914,14.264834,14.415958,True
117,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_5-part1,usamo,incorrect,generic,0.0,,0.24712025,16.83595,15.652987,True
118,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_1,elmosl,incorrect,generic,0.0,,0.20546351,17.585707,16.477007,True
119,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_16-part1,matharena,incorrect,matharena,0.0,,0.29105994,14.18857,14.265601,True
120,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_3,allrussian,detected,generic,0.0,,0.22930321,15.94887,15.698469,False
121,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_4-part1,turkey,incorrect,generic,0.0,,0.28915295,13.019518,13.751059,True
122,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_3-part1,allrussian,correct,generic,0.0,,0.21135502,18.4154,16.050226,False
123,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_2-part1,thai,incorrect,generic,0.0,,0.29676992,15.979088,14.087192,True
124,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_27,matharena,incorrect,matharena,0.0,,0.20258467,15.658929,16.080004,True
125,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_14-part1,matharena,correct,matharena,0.0,,0.23288436,18.844078,16.571901,False
126,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_19-part1,matharena,correct,matharena,0.0,,0.20848905,18.315948,16.404863,False
127,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_10_2025_2-part1,romania,incorrect,generic,0.0,,0.30725428,14.705207,13.876161,True
128,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_15-part1,matharena,incorrect,matharena,0.0,,0.2985516,13.343601,14.027316,True
129,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmo_2025_2,bmo,correct,generic,0.0,,0.24051508,12.840234,16.24409,False
130,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_tst_2025_2,romania,incorrect,generic,0.0,,0.23921181,16.077625,15.391647,True
131,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_29,matharena,incorrect,matharena,0.0,,0.15944073,21.244555,17.921038,True
132,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_24,imosl,incorrect,generic,0.0,,0.28758714,15.666128,14.278044,True
133,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_5,korea,incorrect,generic,0.0,,0.2262009,16.666878,15.577046,True
134,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_14,matharena,correct,matharena,0.0,,0.25055435,15.484561,15.074903,False
135,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_7-part1,chinatst,incorrect,generic,0.0,,0.31857613,14.483426,13.981021,True
136,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_2-part1,iran,incorrect,generic,0.0,,0.30925426,15.237677,13.748416,True
137,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_5,imosl,detected,generic,0.0,,0.22329088,17.273073,15.844422,False
138,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_5,bmosl,incorrect,generic,0.0,,0.26051503,16.472778,14.876192,True
139,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_12-part1,matharena,incorrect,matharena,0.0,,0.21895066,17.297104,16.183899,True
140,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_3-part1,vietnam,incorrect,generic,0.0,,0.18863346,16.178688,17.57279,True
141,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_4,india,incorrect,generic,0.0,,0.27727735,18.139751,14.804178,True
142,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_11,matharena,detected,matharena,0.0,,0.1903675,13.834269,17.626854,False
143,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_8-part1,matharena,detected,matharena,0.0,,0.22475699,16.850693,16.046244,False
144,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_8,israel,incorrect,generic,0.0,,0.2053495,17.816048,16.506365,True
145,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_29-part1,matharena,incorrect,matharena,0.0,,0.17998256,16.055052,17.228046,True
146,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_1-part1,bmosl,incorrect,generic,0.0,,0.17152461,16.360123,17.1162,True
147,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_4,matharena,correct,matharena,0.0,,0.22262307,17.352047,16.644257,False
148,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_24-part1,matharena,incorrect,matharena,0.0,,0.22453998,14.832439,15.09459,True
149,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_52-part1,matharena,correct,matharena,0.0,,0.19210736,17.713062,16.807013,False
150,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_20,matharena,incorrect,matharena,0.0,,0.33664843,14.341179,14.124392,True
151,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_37-part1,imosl,incorrect,generic,0.0,,0.27945578,18.657537,14.591264,True
152,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_9-part1,thai,detected,generic,0.0,,0.27878082,16.400002,14.517558,False
153,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_19-part1,matharena,incorrect,matharena,0.0,,0.22644462,14.288512,15.489028,True
154,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_40,matharena,incorrect,matharena,0.0,,0.25082654,15.55408,15.19389,True
155,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_8-part1,matharena,incorrect,matharena,0.0,,0.18016231,19.180676,17.134737,True
156,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_6,india,incorrect,generic,0.0,,0.19313277,15.263691,17.398617,True
157,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_47,matharena,correct,matharena,0.0,,0.24256791,16.524384,16.015696,False
158,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_37,matharena,incorrect,matharena,0.0,,0.29450622,14.541462,14.154055,True
159,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_16-part1,india,correct,generic,0.0,,0.267866,15.385116,14.717351,False
160,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_16,matharena,correct,matharena,0.0,,0.23547952,19.30785,16.817509,False
161,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_18,matharena,incorrect,matharena,0.0,,0.3437031,14.637068,13.480192,True
162,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_2-part1,elmosl,incorrect,generic,0.0,,0.3598116,13.658456,13.520157,True
163,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,greece_2025_2-part1,greece,incorrect,generic,0.0,,0.21314423,14.965719,16.391748,True
164,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_13-part1,matharena,correct,matharena,0.0,,0.2921636,14.23361,14.527954,False
165,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_9-part1,allrussian,detected,generic,0.0,,0.27406836,15.7095375,14.885696,False
166,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_9-part1,elmosl,incorrect,generic,0.0,,0.20019633,18.273504,15.958582,True
167,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_3,turkey,incorrect,generic,0.0,,0.23180826,18.376238,15.49159,True
168,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_30,matharena,incorrect,matharena,0.0,,0.18842512,17.699203,17.079985,True
169,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_14-part1,india,incorrect,generic,0.0,,0.20931779,17.188448,16.57215,True
170,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_15-part1,matharena,incorrect,matharena,0.0,,0.2889421,15.516671,15.047694,True
171,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_16,matharena,detected,matharena,0.0,,0.030173177,22.457027,19.056421,False
172,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_7,chinatst,incorrect,generic,0.0,,0.25011113,15.866477,15.732943,True
173,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_25,matharena,detected,matharena,0.0,,0.2493135,14.082533,15.626873,False
174,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_2,israel,incorrect,generic,0.0,,0.3481176,14.838363,13.0963955,True
175,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_4,philippines,incorrect,generic,0.0,,0.28910923,12.801685,14.485706,True
176,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_3,elmosl,correct,generic,0.0,,0.25577578,19.935457,15.257048,False
177,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_2,matharena,correct,matharena,0.0,,0.2416443,17.996248,16.350338,False
178,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_15,chinatst,incorrect,generic,0.0,,0.2555787,17.82197,15.612114,True
179,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_5-part1,philippines,correct,generic,0.0,,0.21932961,16.191233,16.22637,False
180,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_1-part1,india,detected,generic,0.0,,0.24362269,19.32026,16.184715,False
181,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_31,imosl,incorrect,generic,0.0,,0.1717842,15.945978,16.172405,True
182,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,serbia_tst_bmo_2025_2-part1,serbia,incorrect,generic,0.0,,0.25172737,16.698168,15.165658,True
183,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_23-part1,matharena,incorrect,matharena,0.0,,0.19576238,15.0355215,16.362343,True
184,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_35,matharena,correct,matharena,0.0,,0.1526198,17.55038,18.522537,False
185,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_3-part1,matharena,correct,matharena,0.0,,0.25623584,17.915312,15.465867,False
186,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_37-part1,matharena,detected,matharena,0.0,,0.28523982,15.966255,14.486551,False
187,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_32-part1,matharena,correct,matharena,0.0,,0.2007344,16.593761,17.282045,False
188,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_6-part1,matharena,detected,matharena,0.0,,0.20607431,16.521149,16.668274,False
189,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_5-part1,elmosl,incorrect,generic,0.0,,0.37741205,14.353662,12.430062,True
190,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_19,matharena,detected,matharena,0.0,,0.2301786,19.785408,16.136362,False
191,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_1,israel,correct,generic,0.0,,0.18827203,17.303066,16.77329,False
192,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_5-part1,imosl,detected,generic,0.0,,0.20322603,16.96076,16.215477,False
193,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_9-part1,matharena,detected,matharena,0.0,,0.24528547,18.697536,15.44125,False
194,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_tst_2025_3-part1,romania,incorrect,generic,0.0,,0.25165296,17.572369,14.965577,True
195,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_6,elmosl,incorrect,generic,0.0,,0.19538966,15.878151,16.32696,True
196,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_8,elmosl,incorrect,generic,0.0,,0.31473416,15.58425,14.074217,True
197,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_7-part1,thai,correct,generic,0.0,,0.24565022,15.46487,15.63689,False
198,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_1,usatst,incorrect,generic,0.0,,0.32075846,13.53369,14.113783,True
199,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_1-part1,israel,detected,generic,0.0,,0.18541259,15.433942,16.88823,False
200,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_45-part1,matharena,correct,matharena,0.0,,0.28553557,18.098598,15.035165,False
201,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_12_2025_1-part1,romania,incorrect,generic,0.0,,0.26463798,16.313454,14.333084,True
202,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_14-part1,chinatst,incorrect,generic,0.0,,0.18738772,16.653778,17.064137,True
203,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_12,matharena,correct,matharena,0.0,,0.23935448,17.992907,16.005903,False
204,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,jbmo_2025_4-part1,jbmo,detected,generic,0.0,,0.26466832,15.639203,14.772798,False
205,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_14,matharena,incorrect,matharena,0.0,,0.20924892,16.017193,16.45071,True
206,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_18,matharena,incorrect,matharena,0.0,,0.21456566,15.265022,16.138666,True
207,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_10-part1,imosl,incorrect,generic,0.0,,0.30207807,16.59954,14.580757,True
208,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_8-part1,allrussian,incorrect,generic,0.0,,0.2293962,16.925982,16.081495,True
209,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_36-part1,matharena,incorrect,matharena,0.0,,0.1529354,16.975336,17.538147,True
210,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_10-part1,allrussian,incorrect,generic,0.0,,0.24185969,18.030405,15.787933,True
211,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_4,matharena,detected,matharena,0.0,,0.24281764,16.70336,15.725373,False
212,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_14,allrussian,detected,generic,0.0,,0.3501906,13.647928,13.45821,False
213,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_15,matharena,correct,matharena,0.0,,0.21900694,16.546026,16.162971,False
214,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_1-part1,iran,incorrect,generic,0.0,,0.20648791,17.421919,16.285776,True
215,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_21,matharena,detected,matharena,0.0,,0.2730632,17.137674,14.936294,False
216,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_20-part1,matharena,incorrect,matharena,0.0,,0.23389576,15.52243,15.997553,True
217,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_15-part1,imosl,incorrect,generic,0.0,,0.19718531,16.227077,16.108337,True
218,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_4-part1,matharena,correct,matharena,0.0,,0.23307014,17.521702,15.731157,False
219,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_8-part1,imosl,incorrect,generic,0.0,,0.23608038,16.6469,15.4873495,True
220,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_53-part1,matharena,incorrect,matharena,0.0,,0.27337086,14.693999,14.815614,True
221,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_19,chinatst,incorrect,generic,0.0,,0.33411023,14.433416,13.792004,True
222,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_10-part1,israel,incorrect,generic,0.0,,0.33379295,15.810808,13.56948,True
223,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,serbia_tst_bmo_2025_4-part1,serbia,incorrect,generic,0.0,,0.24644999,17.67405,15.405657,True
224,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_1,china,incorrect,generic,0.0,,0.28502542,15.317519,14.288781,True
225,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_27-part1,matharena,detected,matharena,0.0,,0.24358796,18.027493,15.944484,False
226,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_1,bmosl,incorrect,generic,0.0,,0.1415641,20.821993,18.459475,True
227,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_3-part1,bmosl,incorrect,generic,0.0,,0.1996766,17.065828,16.43934,True
228,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_17,matharena,incorrect,matharena,0.0,,0.32195517,13.64367,13.667606,True
229,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,german_2025_1,german,detected,generic,0.0,,0.25026494,18.08597,15.619486,False
230,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_3,vietnam,incorrect,generic,0.0,,0.24966611,15.182781,15.69171,True
231,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_29-part1,matharena,correct,matharena,0.0,,0.2339899,14.675629,15.925005,False
232,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_9,imosl,correct,generic,0.0,,0.28230378,14.279894,14.3651705,False
233,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_3,india,incorrect,generic,0.0,,0.33547658,13.216893,13.745905,True
234,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_13-part1,india,incorrect,generic,0.0,,0.22369854,15.681754,15.931013,True
235,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_2,elmosl,incorrect,generic,0.0,,0.23178063,16.243084,15.239063,True
236,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_21-part1,matharena,incorrect,matharena,0.0,,0.3107142,15.762904,14.326393,True
237,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_3,canada,incorrect,generic,0.0,,0.27488062,15.966637,14.304084,True
238,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_2,imosl,correct,generic,0.0,,0.2033393,18.568428,16.9789,False
239,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_1,bmosl,incorrect,generic,0.0,,0.1669663,15.14698,16.945896,True
240,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_9,turkey,detected,generic,0.0,,0.27770212,15.556089,14.661573,False
241,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_6-part1,bulgaria,incorrect,generic,0.0,,0.29851758,15.468619,14.261785,True
242,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_44,matharena,correct,matharena,0.0,,0.22919437,18.289705,16.250463,False
243,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_7,allrussian,incorrect,generic,0.0,,0.33892187,13.583684,13.434216,True
244,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_30,matharena,incorrect,matharena,0.0,,0.22480616,18.713196,15.733221,True
245,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_1-part1,spain,correct,generic,0.0,,0.22634564,17.288206,16.012297,False
246,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_16-part1,matharena,correct,matharena,0.0,,0.24321973,17.640188,16.400402,False
247,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_5-part1,bulgaria,detected,generic,0.0,,0.2241881,15.041965,16.13488,False
248,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_3-part1,bmosl,incorrect,generic,0.0,,0.17792933,16.51176,17.038198,True
249,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_6,izho,detected,generic,0.0,,0.2508052,16.317295,15.21593,False
250,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_11_2025_2-part1,romania,correct,generic,0.0,,0.2646921,17.427235,15.159732,False
251,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_1,thai,detected,generic,0.0,,0.22489792,16.931465,16.471098,False
252,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_4-part1,spain,incorrect,generic,0.0,,0.21372458,15.740971,16.167698,True
253,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_7,elmosl,incorrect,generic,0.0,,0.26329762,15.506438,14.603936,True
254,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_21,matharena,detected,matharena,0.0,,0.25837773,17.570263,15.844258,False
255,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_28-part1,matharena,incorrect,matharena,0.0,,0.22344811,17.01388,15.943502,True
256,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_34,matharena,incorrect,matharena,0.0,,0.21443532,16.467665,17.120579,True
257,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_2-part1,israel,incorrect,generic,0.0,,0.30271998,14.697821,14.032648,True
258,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_1,rmm,incorrect,generic,0.0,,0.26334205,17.516865,14.484758,True
259,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_26-part1,matharena,correct,matharena,0.0,,0.19454688,18.237389,17.381683,False
260,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_12-part1,matharena,incorrect,matharena,0.0,,0.21623984,15.2436075,16.218533,True
261,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_7-part1,india,incorrect,generic,0.0,,0.28626376,14.687002,14.797032,True
262,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_31,matharena,correct,matharena,0.0,,0.21215688,17.35749,16.517231,False
263,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_2-part1,usatst,detected,generic,0.0,,0.26302692,16.069359,14.732594,False
264,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,german_2025_3,german,detected,generic,0.0,,0.24860914,16.538511,15.688273,False
265,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_33,matharena,incorrect,matharena,0.0,,0.24463017,15.967872,15.381394,True
266,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_5-part1,pan,correct,generic,0.0,,0.23302105,17.662514,15.599316,False
267,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_11-part1,matharena,correct,matharena,0.0,,0.23086347,17.44482,16.055176,False
268,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_7-part1,bmosl,correct,generic,0.0,,0.2535269,17.114109,15.583117,False
269,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_17,matharena,correct,matharena,0.0,,0.18495041,17.175058,17.116413,False
270,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_21-part1,india,incorrect,generic,0.0,,0.22875589,16.362032,15.791496,True
271,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_1-part1,rmm,incorrect,generic,0.0,,0.27372393,16.565172,14.455468,True
272,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_16,india,incorrect,generic,0.0,,0.27061614,14.385787,14.738576,True
273,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_6,usamo,incorrect,generic,0.0,,0.3929569,15.407404,12.767899,True
274,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_28-part1,matharena,incorrect,matharena,0.0,,0.24191809,16.782642,15.275888,True
275,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_30-part1,imosl,incorrect,generic,0.0,,0.26531336,15.0522375,14.418568,True
276,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_3-part1,chinatst,detected,generic,0.0,,0.31955454,15.36613,13.934589,False
277,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_27-part1,matharena,incorrect,matharena,0.0,,0.17827846,16.99385,17.342207,True
278,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_2,canada,correct,generic,0.0,,0.2402669,16.03585,15.621675,False
279,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_4,matharena,correct,matharena,0.0,,0.13499373,22.260666,19.412764,False
280,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_6,matharena,detected,matharena,0.0,,0.21129705,16.455677,16.38678,False
281,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_9-part1,imosl,correct,generic,0.0,,0.3017678,14.17757,14.241214,False
282,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_40-part1,matharena,detected,matharena,0.0,,0.24047439,15.507604,15.563763,False
283,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_4,usatst,correct,generic,0.0,,0.26278675,16.885574,15.013294,False
284,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_25-part1,matharena,incorrect,matharena,0.0,,0.23569207,18.823109,16.091656,True
285,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_6,elmosl,incorrect,generic,0.0,,0.25389683,16.00366,15.537837,True
286,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_1,polish,detected,generic,0.0,,0.24695475,17.340107,15.854604,False
287,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,nordic_2025_2-part1,nordic,detected,generic,0.0,,0.17836642,17.108452,17.292736,False
288,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_2,elmosl,incorrect,generic,0.0,,0.3521996,14.686035,13.974115,True
289,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_9-part1,matharena,incorrect,matharena,0.0,,0.17464843,16.370546,17.741003,True
290,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_11-part1,matharena,incorrect,matharena,0.0,,0.3605066,13.25602,13.334034,True
291,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,german_2025_4,german,correct,generic,0.0,,0.1744453,19.187286,17.641443,False
292,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_21-part1,matharena,correct,matharena,0.0,,0.22266011,16.342886,16.443924,False
293,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_23-part1,matharena,detected,matharena,0.0,,0.20608796,16.915398,16.76065,False
294,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_15,india,incorrect,generic,0.0,,0.32607082,14.162984,13.519458,True
295,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_6-part1,matharena,incorrect,matharena,0.0,,0.27549282,17.513493,14.981586,True
296,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_11-part1,india,incorrect,generic,0.0,,0.2850858,15.106479,14.84153,True
297,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_12,chinatst,incorrect,generic,0.0,,0.23411766,16.780766,15.53231,True
298,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_27-part1,imosl,correct,generic,0.0,,0.25738308,15.515735,14.943782,False
299,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_3,elmosl,incorrect,generic,0.0,,0.21887635,18.649086,16.349644,True
300,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_6-part1,elmosl,incorrect,generic,0.0,,0.2037338,15.292317,16.90616,True
301,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_5-part1,rmm,detected,generic,0.0,,0.25845888,13.576719,15.211196,False
302,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_23,imosl,incorrect,generic,0.0,,0.1397606,15.779557,18.167238,True
303,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_2,china,incorrect,generic,0.0,,0.15244694,15.975225,17.652594,True
304,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_6,iran,incorrect,generic,0.0,,0.21107134,16.280342,16.37915,True
305,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_2,elmosl,correct,generic,0.0,,0.2622013,15.332441,14.751584,False
306,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_2,usatst,incorrect,generic,0.0,,0.253174,15.158353,14.771926,True
307,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_2-part1,allrussian,correct,generic,0.0,,0.21897846,14.594781,16.046604,False
308,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_33,matharena,incorrect,matharena,0.0,,0.20536546,18.763153,15.752227,True
309,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_4,egmo,corrected,generic,0.0,,0.2501969,16.06523,15.63838,False
310,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_14-part1,allrussian,detected,generic,0.0,,0.33184978,14.976141,13.836857,False
311,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_5,vietnam,detected,generic,0.0,,0.30499253,13.255719,14.159172,False
312,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_13,india,incorrect,generic,0.0,,0.2537535,15.056291,15.384975,True
313,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_3-part1,matharena,detected,matharena,0.0,,0.2405468,15.836189,15.971061,False
314,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_11_2025_3-part1,romania,detected,generic,0.0,,0.2706943,14.512951,14.566274,False
315,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_14,imosl,incorrect,generic,0.0,,0.34687418,16.499252,13.652947,True
316,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_7,usatst,correct,generic,0.0,,0.22003947,18.220846,16.137295,False
317,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_37-part1,matharena,incorrect,matharena,0.0,,0.21053259,16.68189,15.9126425,True
318,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_32,matharena,correct,matharena,0.0,,0.10727815,18.357681,20.304314,False
319,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_1-part1,korea,correct,generic,0.0,,0.113339186,17.212032,18.171303,False
320,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_3,matharena,correct,matharena,0.0,,0.17520738,18.367292,17.622551,False
321,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_6-part1,pan,incorrect,generic,0.0,,0.36231872,15.752401,13.350475,True
322,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_26,matharena,correct,matharena,0.0,,0.20046331,18.219044,17.580078,False
323,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,jbmo_2025_2,jbmo,correct,generic,0.0,,0.23553789,17.180258,15.732356,False
324,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_26,matharena,incorrect,matharena,0.0,,0.25281692,16.253536,16.242834,True
325,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_6-part1,elmosl,incorrect,generic,0.0,,0.26535985,16.118677,14.665399,True
326,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_7-part1,usatst,correct,generic,0.0,,0.21916966,17.03268,16.263601,False
327,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,greece_2025_1,greece,detected,generic,0.0,,0.2834216,17.76097,15.165116,False
328,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,german_2025_3-part1,german,detected,generic,0.0,,0.13770951,20.447454,19.236418,False
329,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,nordic_2025_2,nordic,detected,generic,0.0,,0.25108376,14.04081,15.341858,False
330,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_1-part1,matharena,detected,matharena,0.0,,0.25355744,17.565763,15.473786,False
331,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_11,matharena,incorrect,matharena,0.0,,0.20209189,16.96481,16.977232,True
332,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_16,imosl,incorrect,generic,0.0,,0.25463057,16.009872,15.883864,True
333,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_20-part1,matharena,incorrect,matharena,0.0,,0.17758438,16.67039,17.070503,True
334,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_4,usamo,correct,generic,0.0,,0.24116644,15.010148,16.199648,False
335,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_6,matharena,detected,matharena,0.0,,0.23079903,17.230099,16.100662,False
336,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_19,matharena,detected,matharena,0.0,,0.1698651,16.02191,17.19473,False
337,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_3-part1,turkey,correct,generic,0.0,,0.24419115,15.236082,15.167022,False
338,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_8-part1,elmosl,detected,generic,0.0,,0.20148438,19.00248,16.732574,False
339,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_32,imosl,incorrect,generic,0.0,,0.3007906,15.574796,13.865297,True
340,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_3,matharena,incorrect,matharena,0.0,,0.20655684,17.558506,16.45951,True
341,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,jbmo_2025_4,jbmo,correct,generic,0.0,,0.26894757,15.387435,14.68866,False
342,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_tst_2025_1-part1,romania,incorrect,generic,0.0,,0.28518653,14.213148,14.494153,True
343,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_2,rmm,incorrect,generic,0.0,,0.26048988,13.973329,14.658242,True
344,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_4-part1,elmosl,incorrect,generic,0.0,,0.1523187,17.294735,17.9082,True
345,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_4,bulgaria,incorrect,generic,0.0,,0.21334696,15.9706,15.974364,True
346,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_4-part1,elmosl,incorrect,generic,0.0,,0.30650246,16.39136,14.290352,True
347,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_4,israel,incorrect,generic,0.0,,0.26612923,18.717497,15.363515,True
348,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_5,elmosl,incorrect,generic,0.0,,0.26574215,15.092896,14.381414,True
349,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_13-part1,imosl,incorrect,generic,0.0,,0.34867588,12.953356,13.224275,True
350,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_34,imosl,incorrect,generic,0.0,,0.28844008,15.883138,14.784599,True
351,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_10,india,incorrect,generic,0.0,,0.30531174,14.565241,14.08912,True
352,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_48,matharena,correct,matharena,0.0,,0.20232962,18.482027,17.002068,False
353,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_4,bmosl,correct,generic,0.0,,0.19434369,18.728504,16.785448,False
354,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_5,polish,detected,generic,0.0,,0.30974615,13.796326,14.067598,False
355,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_49-part1,matharena,detected,matharena,0.0,,0.16191873,16.886562,17.863222,False
356,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_17-part1,chinatst,incorrect,generic,0.0,,0.3540768,16.513603,13.458163,True
357,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_1-part1,elmosl,detected,generic,0.0,,0.24513851,15.542827,15.7184105,False
358,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_1-part1,usatst,detected,generic,0.0,,0.33587027,14.080598,13.727487,False
359,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_12,matharena,incorrect,matharena,0.0,,0.24006642,15.78721,15.736588,True
360,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_5,israel,incorrect,generic,0.0,,0.24504101,18.141184,15.393101,True
361,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_4-part1,allrussian,incorrect,generic,0.0,,0.29261738,14.011498,14.707948,True
362,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,greece_2025_1-part1,greece,detected,generic,0.0,,0.23110121,17.326757,16.394232,False
363,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_9,elmosl,incorrect,generic,0.0,,0.18284108,17.270208,16.230722,True
364,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_1,spain,correct,generic,0.0,,0.24109198,19.097157,15.824242,False
365,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_3,bulgaria,detected,generic,0.0,,0.32092342,14.338957,13.812549,False
366,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_8-part1,usatst,incorrect,generic,0.0,,0.24559471,16.477428,15.303112,True
367,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_tst_2025_2-part1,romania,incorrect,generic,0.0,,0.24920663,16.722887,15.185328,True
368,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_1-part1,chinatst,incorrect,generic,0.0,,0.19951764,16.986645,15.906034,True
369,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_8,matharena,correct,matharena,0.0,,0.24594373,17.26175,15.764101,False
370,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_6,matharena,incorrect,matharena,0.0,,0.24505061,17.821312,15.889081,True
371,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_12,india,correct,generic,0.0,,0.24997607,14.596922,15.792421,False
372,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_3-part1,elmosl,correct,generic,0.0,,0.24640785,14.007657,15.183085,False
373,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_10,israel,incorrect,generic,0.0,,0.3420555,14.914147,13.5112705,True
374,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_1,korea,incorrect,generic,0.0,,0.19194521,15.714765,16.423628,True
375,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_4-part1,pan,correct,generic,0.0,,0.19447911,20.606964,17.211409,False
376,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmo_2025_3,bmo,corrected,generic,0.0,,0.23839428,18.484093,15.721416,False
377,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_6-part1,korea,incorrect,generic,0.0,,0.27278098,15.577771,14.546871,True
378,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_3-part1,india,incorrect,generic,0.0,,0.30740047,15.253498,14.087842,True
379,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_5-part1,elmosl,incorrect,generic,0.0,,0.2948751,14.621295,14.652074,True
380,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_34-part1,imosl,incorrect,generic,0.0,,0.2741045,13.600708,15.163346,True
381,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_23,matharena,correct,matharena,0.0,,0.17241798,21.254936,17.60219,False
382,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_2,vietnam,correct,generic,0.0,,0.19962163,16.254665,16.924725,False
383,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_3,israel,detected,generic,0.0,,0.24763367,16.71425,15.839856,False
384,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_8-part1,matharena,correct,matharena,0.0,,0.231486,17.41377,16.293285,False
385,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_5-part1,turkey,incorrect,generic,0.0,,0.3159169,14.940583,13.829753,True
386,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_53,matharena,incorrect,matharena,0.0,,0.26246262,13.650621,14.516337,True
387,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_18,chinatst,detected,generic,0.0,,0.2829356,15.716245,14.18926,False
388,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_11-part1,matharena,incorrect,matharena,0.0,,0.2558515,16.21003,16.044054,True
389,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_5,india,incorrect,generic,0.0,,0.20318764,16.379545,16.121601,True
390,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_14-part1,matharena,incorrect,matharena,0.0,,0.22936326,16.399677,15.883338,True
391,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_5-part1,egmo,incorrect,generic,0.0,,0.28916976,14.849727,14.149651,True
392,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_26-part1,matharena,correct,matharena,0.0,,0.24040647,19.78501,16.138802,False
393,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_3-part1,matharena,correct,matharena,0.0,,0.16016641,18.153593,17.784246,False
394,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_39-part1,matharena,detected,matharena,0.0,,0.23862018,15.935817,15.937058,False
395,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,jbmo_2025_2-part1,jbmo,correct,generic,0.0,,0.22456749,15.059033,15.966641,False
396,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_16-part1,matharena,correct,matharena,0.0,,0.24924819,19.696045,15.85192,False
397,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_2,chinatst,incorrect,generic,0.0,,0.16626352,18.476357,17.539455,True
398,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_1-part1,canada,incorrect,generic,0.0,,0.29797474,16.52744,14.427556,True
399,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_18-part1,matharena,incorrect,matharena,0.0,,0.3159515,14.954004,14.150563,True
400,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_17-part1,matharena,incorrect,matharena,0.0,,0.3601001,13.429772,13.184072,True
401,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_3,matharena,detected,matharena,0.0,,0.22756405,15.630099,16.441614,False
402,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_12_2025_3-part1,romania,incorrect,generic,0.0,,0.18399587,16.760105,16.232798,True
403,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_18,matharena,incorrect,matharena,0.0,,0.30359796,17.032333,14.516408,True
404,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_11,allrussian,incorrect,generic,0.0,,0.22667535,16.379316,15.694183,True
405,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_39,matharena,incorrect,matharena,0.0,,0.17121662,16.316835,17.900303,True
406,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_22-part1,matharena,correct,matharena,0.0,,0.33516666,15.077667,13.7971115,False
407,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_1-part1,egmo,correct,generic,0.0,,0.25835198,15.520491,15.445197,False
408,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_9,matharena,detected,matharena,0.0,,0.26017317,15.2736025,15.1438875,False
409,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_3,matharena,correct,matharena,0.0,,0.22501475,17.674871,16.043818,False
410,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_18-part1,matharena,incorrect,matharena,0.0,,0.2009569,16.263885,16.535309,True
411,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_1,bulgaria,corrected,generic,0.0,,0.20564677,18.561762,16.647545,False
412,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_7-part1,philippines,detected,generic,0.0,,0.22924319,15.875981,15.796967,False
413,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_5,usamo,incorrect,generic,0.0,,0.19249777,19.024649,17.087236,True
414,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_4-part1,canada,incorrect,generic,0.0,,0.24953306,14.757966,15.18054,True
415,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_1,philippines,correct,generic,0.0,,0.25415906,14.421349,15.240984,False
416,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_23,matharena,correct,matharena,0.0,,0.23905595,17.14483,15.981003,False
417,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_8-part1,india,incorrect,generic,0.0,,0.32680127,15.349813,13.823149,True
418,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_43,matharena,incorrect,matharena,0.0,,0.27455032,14.500297,14.786467,True
419,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmo_2025_4,bmo,incorrect,generic,0.0,,0.30898124,16.699316,13.875181,True
420,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_10-part1,india,incorrect,generic,0.0,,0.3089826,15.192212,14.091898,True
421,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_4-part1,rmm,detected,generic,0.0,,0.29644984,14.563846,14.1382475,False
422,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_8,iran,incorrect,generic,0.0,,0.35524693,16.882612,13.752155,True
423,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_3-part1,matharena,correct,matharena,0.0,,0.251589,17.451414,15.416078,False
424,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_15,matharena,incorrect,matharena,0.0,,0.3298333,16.24043,14.15808,True
425,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_23-part1,imosl,incorrect,generic,0.0,,0.28644907,16.591583,14.423187,True
426,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_5-part1,matharena,incorrect,matharena,0.0,,0.1865996,16.797182,16.57934,True
427,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_7,imosl,incorrect,generic,0.0,,0.19369707,16.69608,16.178274,True
428,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_9-part1,elmosl,incorrect,generic,0.0,,0.31283304,14.784936,14.21367,True
429,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_6,bmosl,correct,generic,0.0,,0.2111678,13.768552,16.626793,False
430,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_7,iran,incorrect,generic,0.0,,0.22237632,15.999067,15.7263,True
431,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_1,pan,detected,generic,0.0,,0.19650266,16.327656,17.127905,False
432,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_6,turkey,incorrect,generic,0.0,,0.23821314,16.76677,15.338001,True
433,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_19-part1,chinatst,incorrect,generic,0.0,,0.3531608,13.373158,13.383157,True
434,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_27-part1,matharena,incorrect,matharena,0.0,,0.22726865,16.157091,15.901111,True
435,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_8-part1,iran,incorrect,generic,0.0,,0.30430943,14.754824,14.5430155,True
436,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_17-part1,imosl,incorrect,generic,0.0,,0.23101111,14.596598,15.926154,True
437,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_1-part1,matharena,correct,matharena,0.0,,0.21645327,19.057804,16.34505,False
438,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_12,matharena,incorrect,matharena,0.0,,0.24539071,17.852015,15.667994,True
439,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_21,matharena,correct,matharena,0.0,,0.2249195,19.437138,16.435478,False
440,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_5,pan,detected,generic,0.0,,0.22701198,17.998766,15.701584,False
441,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_9-part1,chinatst,detected,generic,0.0,,0.27383617,17.63958,14.88006,False
442,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_1-part1,allrussian,incorrect,generic,0.0,,0.24461807,16.322495,15.580376,True
443,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_13,allrussian,detected,generic,0.0,,0.24599215,18.15087,15.994466,False
444,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_7,matharena,correct,matharena,0.0,,0.2166579,21.764364,16.473345,False
445,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_17-part1,matharena,correct,matharena,0.0,,0.15475066,18.29669,19.081438,False
446,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_6-part1,china,incorrect,generic,0.0,,0.17062664,15.660512,16.798101,True
447,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_36-part1,imosl,incorrect,generic,0.0,,0.3333723,14.898372,13.635409,True
448,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_3-part1,izho,incorrect,generic,0.0,,0.2280339,15.30259,15.481602,True
449,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_35-part1,matharena,correct,matharena,0.0,,0.1739488,17.078604,17.253815,False
450,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_5,thai,incorrect,generic,0.0,,0.28575447,15.535319,14.395213,True
451,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_42-part1,matharena,incorrect,matharena,0.0,,0.3247728,13.479532,14.119188,True
452,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,nordic_2025_3,nordic,correct,generic,0.0,,0.21248078,14.885613,16.713772,False
453,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_4-part1,egmo,correct,generic,0.0,,0.28053364,14.566245,14.518455,False
454,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_38,matharena,incorrect,matharena,0.0,,0.16566369,15.671014,17.118423,True
455,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_5-part1,matharena,correct,matharena,0.0,,0.2831549,16.489767,14.908489,False
456,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_6-part1,iran,incorrect,generic,0.0,,0.17980221,17.949997,16.78285,True
457,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_2,bmosl,detected,generic,0.0,,0.19245267,15.734021,16.882812,False
458,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_23,matharena,correct,matharena,0.0,,0.21950115,21.106237,16.945633,False
459,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_25-part1,matharena,incorrect,matharena,0.0,,0.24569944,15.352221,15.724598,True
460,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_1-part1,vietnam,correct,generic,0.0,,0.19399238,17.526358,17.0239,False
461,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_26-part1,matharena,correct,matharena,0.0,,0.18841758,17.295418,17.26035,False
462,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_29-part1,matharena,correct,matharena,0.0,,0.18570505,17.540203,17.367542,False
463,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_2,iran,incorrect,generic,0.0,,0.24188818,15.871916,15.05755,True
464,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_16-part1,imosl,incorrect,generic,0.0,,0.32362735,16.496681,14.503261,True
465,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_4,elmosl,incorrect,generic,0.0,,0.19455278,16.040577,16.45602,True
466,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_7,bmosl,incorrect,generic,0.0,,0.20140609,15.0081835,16.61132,True
467,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_5,matharena,detected,matharena,0.0,,0.224153,15.386337,16.483799,False
468,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_2-part1,spain,incorrect,generic,0.0,,0.24376881,14.230733,15.404153,True
469,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_12-part1,india,correct,generic,0.0,,0.23802641,13.640352,15.993021,False
470,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_3-part1,pan,corrected,generic,0.0,,0.19657719,18.755327,17.042631,False
471,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_1,bmosl,incorrect,generic,0.0,,0.35706422,13.171278,13.211558,True
472,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_27-part1,matharena,detected,matharena,0.0,,0.1721893,18.57409,17.449583,False
473,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_1,matharena,correct,matharena,0.0,,0.23280105,16.841997,15.960268,False
474,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_20,chinatst,incorrect,generic,0.0,,0.23131464,17.352964,15.793201,True
475,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_8,philippines,incorrect,generic,0.0,,0.2357558,15.715294,15.439984,True
476,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_11,india,incorrect,generic,0.0,,0.2804032,14.930104,14.841453,True
477,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_19,india,incorrect,generic,0.0,,0.2869958,15.717448,14.779788,True
478,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_31-part1,matharena,correct,matharena,0.0,,0.24818975,18.575232,15.575279,False
479,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_3-part1,spain,incorrect,generic,0.0,,0.9387597,16.806314,12.646279,True
480,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_1-part1,bmosl,incorrect,generic,0.0,,0.3525922,15.327795,13.326158,True
481,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_1,india,detected,generic,0.0,,0.2544778,18.112223,15.95884,False
482,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_10-part1,matharena,detected,matharena,0.0,,0.30012396,14.170396,13.851357,False
483,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_2,usamo,incorrect,generic,0.0,,0.2924149,15.229164,13.981417,True
484,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_15,allrussian,incorrect,generic,0.0,,0.23431614,15.3108635,15.943504,True
485,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_16-part1,allrussian,incorrect,generic,0.0,,0.2807136,16.613636,14.458209,True
486,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_2-part1,pan,correct,generic,0.0,,0.2234925,19.551462,17.07902,False
487,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_33,imosl,detected,generic,0.0,,0.17318062,15.522119,17.155111,False
488,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_20,matharena,detected,matharena,0.0,,0.20097707,16.433237,16.3493,False
489,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_15,matharena,correct,matharena,0.0,,0.25878918,17.914238,16.122007,False
490,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_6,pan,incorrect,generic,0.0,,0.4269182,13.373099,12.135142,True
491,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_2,philippines,incorrect,generic,0.0,,0.2575234,14.123833,14.789688,True
492,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_5-part1,allrussian,incorrect,generic,0.0,,0.15053435,17.436083,17.66588,True
493,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_4-part1,matharena,correct,matharena,0.0,,0.15365928,21.733917,19.009132,False
494,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_8,india,incorrect,generic,0.0,,0.3187966,14.328686,13.539672,True
495,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_4,korea,correct,generic,0.0,,0.199402,16.841316,16.910358,False
496,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_tst_2025_1,romania,incorrect,generic,0.0,,0.22573248,15.585502,15.376174,True
497,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_10,chinatst,incorrect,generic,0.0,,0.27119818,17.81529,15.14368,True
498,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_1,matharena,detected,matharena,0.0,,0.16003877,17.298073,18.366812,False
499,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_9,matharena,incorrect,matharena,0.0,,0.16001673,16.79102,18.2298,True
500,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_35,imosl,detected,generic,0.0,,0.25826705,13.8047695,15.513055,False
501,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_1-part1,philippines,incorrect,generic,0.0,,0.23147802,15.299237,15.542343,True
502,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_1-part1,matharena,detected,matharena,0.0,,0.26742682,17.233362,15.557369,False
503,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_9_2025_1,romania,incorrect,generic,0.0,,0.1454735,19.756098,17.920765,True
504,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_10,matharena,incorrect,matharena,0.0,,0.16714948,18.008287,17.37256,True
505,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_3-part1,egmo,incorrect,generic,0.0,,0.14108889,17.331451,18.202028,True
506,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_8-part1,elmosl,detected,generic,0.0,,0.27919656,16.439066,14.613155,False
507,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_11_2025_3,romania,detected,generic,0.0,,0.27087137,15.337447,14.6408825,False
508,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_1,allrussian,incorrect,generic,0.0,,0.23592845,16.957052,15.589729,True
509,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_8,elmosl,incorrect,generic,0.0,,0.22161075,15.49593,15.981992,True
510,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_6,matharena,correct,matharena,0.0,,0.24254934,17.891571,15.946294,False
511,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_13-part1,allrussian,detected,generic,0.0,,0.29407552,15.105328,14.827616,False
512,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_6,polish,detected,generic,0.0,,0.18372722,20.016386,17.478094,False
513,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_10,matharena,incorrect,matharena,0.0,,0.25928676,17.594515,15.246109,True
514,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_24-part1,matharena,correct,matharena,0.0,,0.22178036,17.93663,16.410486,False
515,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_7-part1,elmosl,incorrect,generic,0.0,,0.25567368,17.884003,14.736357,True
516,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_25,matharena,correct,matharena,0.0,,0.2666853,17.889753,15.66317,False
517,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_11-part1,chinatst,correct,generic,0.0,,0.23072425,18.678381,15.738828,False
518,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_29-part1,matharena,correct,matharena,0.0,,0.20710236,15.699839,17.108513,False
519,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,serbia_tst_bmo_2025_4,serbia,incorrect,generic,0.0,,0.17326824,17.695572,16.789806,True
520,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_3,iran,incorrect,generic,0.0,,0.33082247,15.733652,13.581942,True
521,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_4,india,incorrect,generic,0.0,,0.30771765,15.210249,14.047945,True
522,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,jbmo_2025_1-part1,jbmo,detected,generic,0.0,,0.16761996,18.788288,17.817467,False
523,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_1-part1,turkey,incorrect,generic,0.0,,0.28333718,17.94919,14.839868,True
524,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_5-part1,spain,detected,generic,0.0,,0.22659487,18.506567,15.50956,False
525,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_23-part1,matharena,correct,matharena,0.0,,0.24146886,16.567024,15.800927,False
526,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_7-part1,matharena,correct,matharena,0.0,,0.23207028,19.920208,16.351402,False
527,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_11_2025_1,romania,detected,generic,0.0,,0.27311748,17.350863,14.547884,False
528,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_8-part1,thai,incorrect,generic,0.0,,0.32280427,13.556612,13.761227,True
529,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_20,matharena,detected,matharena,0.0,,0.19705024,15.281287,16.588118,False
530,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_7-part1,matharena,detected,matharena,0.0,,0.16199574,17.667452,18.25851,False
531,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_5-part1,bmosl,incorrect,generic,0.0,,0.26979747,16.3462,14.702097,True
532,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_5,chinatst,detected,generic,0.0,,0.3358202,15.89372,13.788612,False
533,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_15-part1,india,incorrect,generic,0.0,,0.33673313,15.003761,13.327189,True
534,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_26-part1,matharena,correct,matharena,0.0,,0.2068801,18.07531,16.89805,False
535,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_5,allrussian,correct,generic,0.0,,0.24251854,16.207222,15.386122,False
536,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_15-part1,allrussian,incorrect,generic,0.0,,0.25948286,15.191289,15.085338,True
537,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_9,india,correct,generic,0.0,,0.21805458,15.579703,16.446545,False
538,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_8-part1,chinatst,incorrect,generic,0.0,,0.22517084,15.190371,16.44579,True
539,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,german_2025_2,german,incorrect,generic,0.0,,0.23074207,17.810848,15.888044,True
540,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_3-part1,korea,incorrect,generic,0.0,,0.24889556,15.592433,15.453138,True
541,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_1,canada,incorrect,generic,0.0,,0.31294605,14.976259,14.225817,True
542,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_24,matharena,incorrect,matharena,0.0,,0.23826556,19.023039,15.814024,True
543,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,jbmo_2025_3,jbmo,incorrect,generic,0.0,,0.15149401,19.64081,18.541622,True
544,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_3,korea,incorrect,generic,0.0,,0.22783257,14.442324,15.981469,True
545,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_2,india,incorrect,generic,0.0,,0.31909236,20.16627,14.801156,True
546,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_5,izho,incorrect,generic,0.0,,0.21680279,14.395027,16.050928,True
547,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_12,imosl,incorrect,generic,0.0,,0.34264043,15.010072,13.799927,True
548,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_5-part1,china,incorrect,generic,0.0,,0.21207812,16.796856,15.906487,True
549,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_21,matharena,detected,matharena,0.0,,0.24286313,16.18707,15.57186,False
550,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_5-part1,india,incorrect,generic,0.0,,0.30622047,15.910267,14.443104,True
551,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_3,egmo,corrected,generic,0.0,,0.20734654,15.411879,16.61067,False
552,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_49,matharena,detected,matharena,0.0,,0.21326296,16.278942,16.145918,False
553,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_7,matharena,incorrect,matharena,0.0,,0.33196467,13.922309,13.574249,True
554,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_4-part1,philippines,incorrect,generic,0.0,,0.22670254,14.761546,15.708488,True
555,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_3-part1,philippines,detected,generic,0.0,,0.22796084,15.956915,15.50298,False
556,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_6,thai,correct,generic,0.0,,0.23587504,16.559015,15.423439,False
557,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_25,matharena,incorrect,matharena,0.0,,0.244988,16.118599,15.226632,True
558,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_6,chinatst,incorrect,generic,0.0,,0.19300404,17.665607,16.260033,True
559,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_8,matharena,incorrect,matharena,0.0,,0.16678983,14.388144,17.196,True
560,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_1-part1,elmosl,incorrect,generic,0.0,,0.26149467,16.89185,15.505857,True
561,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_14,india,detected,generic,0.0,,0.20698124,14.964991,16.699955,False
562,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_2,turkey,incorrect,generic,0.0,,0.22528352,17.128452,16.283546,True
563,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_6-part1,matharena,correct,matharena,0.0,,0.22650477,17.200966,16.008427,False
564,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_10,thai,detected,generic,0.0,,0.26192623,18.40335,15.349049,False
565,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_6,philippines,incorrect,generic,0.0,,0.2581569,15.916015,14.561786,True
566,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_4-part1,usatst,correct,generic,0.0,,0.26825136,17.010948,14.792088,False
567,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_6-part1,matharena,correct,matharena,0.0,,0.20148632,19.266422,17.340363,False
568,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_10_2025_3,romania,correct,generic,0.0,,0.22605531,16.63085,15.754007,False
569,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_3,chinatst,detected,generic,0.0,,0.3225831,15.993744,13.870469,False
570,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_9,elmosl,incorrect,generic,0.0,,0.29994974,15.177157,14.699035,True
571,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_10-part1,thai,incorrect,generic,0.0,,0.28438306,15.940155,15.047502,True
572,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_29-part1,imosl,incorrect,generic,0.0,,0.20591307,14.391474,15.935488,True
573,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_14,matharena,correct,matharena,0.0,,0.1923347,17.993073,17.281374,False
574,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_3,imosl,detected,generic,0.0,,0.17704518,20.49046,17.91198,False
575,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_5,matharena,correct,matharena,0.0,,0.24483764,17.243374,16.061522,False
576,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_5-part1,bmosl,incorrect,generic,0.0,,0.2205517,17.48072,16.403982,True
577,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_7,bmosl,detected,generic,0.0,,0.25092804,16.731207,15.612248,False
578,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_21,imosl,incorrect,generic,0.0,,0.29803783,16.068752,14.227328,True
579,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_1,matharena,correct,matharena,0.0,,0.27051663,16.886486,15.308114,False
580,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_20-part1,imosl,correct,generic,0.0,,0.21963964,14.266091,16.079231,False
581,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_5,bmosl,corrected,generic,0.0,,0.2391232,19.214237,16.228073,False
582,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_31-part1,matharena,correct,matharena,0.0,,0.22123608,17.134926,16.943853,False
583,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_11,imosl,incorrect,generic,0.0,,0.36786768,14.682892,13.024283,True
584,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_2-part1,bmosl,detected,generic,0.0,,0.1702614,17.676153,17.346478,False
585,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_6-part1,turkey,incorrect,generic,0.0,,0.22939186,16.129301,16.301878,True
586,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_13,imosl,incorrect,generic,0.0,,0.36926678,14.80378,12.990184,True
587,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,serbia_tst_bmo_2025_3-part1,serbia,detected,generic,0.0,,0.17015815,16.91575,17.206429,False
588,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_8,elmosl,detected,generic,0.0,,0.2267386,17.406736,15.916389,False
589,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_42,matharena,incorrect,matharena,0.0,,0.31671262,13.763018,14.255425,True
590,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_21-part1,chinatst,incorrect,generic,0.0,,0.39309368,15.927447,12.4885,True
591,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_9,usatst,incorrect,generic,0.0,,0.19188222,15.943686,17.297293,True
592,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_9,allrussian,correct,generic,0.0,,0.26512873,16.21922,15.072804,False
593,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_7,thai,correct,generic,0.0,,0.20448425,16.929663,16.323969,False
594,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_20-part1,matharena,incorrect,matharena,0.0,,0.4093311,13.016538,12.329929,True
595,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_26,matharena,correct,matharena,0.0,,0.22180116,16.817709,16.574223,False
596,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_2-part1,matharena,correct,matharena,0.0,,0.24444728,17.662506,16.293709,False
597,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_12-part1,matharena,correct,matharena,0.0,,0.23500976,18.428265,16.177467,False
598,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_46-part1,matharena,detected,matharena,0.0,,0.31237313,13.243141,14.359995,False
599,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_6-part1,egmo,detected,generic,0.0,,0.27663305,14.770616,14.796693,False
600,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_34-part1,matharena,incorrect,matharena,0.0,,0.22833766,15.388701,16.331148,True
601,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_1-part1,india,correct,generic,0.0,,0.18602332,17.9284,17.089876,False
602,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_4-part1,bmosl,correct,generic,0.0,,0.22779778,13.995879,15.890055,False
603,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_1,matharena,correct,matharena,0.0,,0.2243756,19.13789,16.062004,False
604,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_12_2025_2-part1,romania,correct,generic,0.0,,0.22628987,19.645012,16.078173,False
605,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_44-part1,matharena,correct,matharena,0.0,,0.1771931,21.081501,18.056627,False
606,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_6,israel,incorrect,generic,0.0,,0.21013716,15.297094,16.12502,True
607,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_4-part1,polish,detected,generic,0.0,,0.32518533,14.954977,13.966554,False
608,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_1,chinatst,incorrect,generic,0.0,,0.20164314,15.4436035,15.744635,True
609,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_10-part1,matharena,detected,matharena,0.0,,0.24928112,17.396753,15.650787,False
610,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_3,bmosl,incorrect,generic,0.0,,0.1739133,15.677893,16.952219,True
611,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_25-part1,matharena,detected,matharena,0.0,,0.23635444,14.391855,15.764302,False
612,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_2,polish,detected,generic,0.0,,0.24746504,17.9979,15.312555,False
613,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_7,turkey,incorrect,generic,0.0,,0.18182416,15.013046,17.440363,True
614,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_8-part1,turkey,incorrect,generic,0.0,,0.1976009,15.659243,16.342186,True
615,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_2-part1,matharena,correct,matharena,0.0,,0.15895894,23.028448,18.81103,False
616,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_15-part1,matharena,detected,matharena,0.0,,0.24387722,15.974445,15.104274,False
617,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_13,matharena,corrected,matharena,0.0,,0.2588361,16.613176,15.58322,False
618,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_5,matharena,incorrect,matharena,0.0,,0.22149248,16.932306,15.69772,True
619,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_10_2025_3-part1,romania,incorrect,generic,0.0,,0.20780225,17.04765,16.145964,True
620,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_10,matharena,incorrect,matharena,0.0,,0.30788207,14.438082,14.106367,True
621,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_8-part1,matharena,correct,matharena,0.0,,0.22586972,16.267622,16.284863,False
622,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_20,india,incorrect,generic,0.0,,0.25152722,14.882249,15.201367,True
623,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_12-part1,matharena,correct,matharena,0.0,,0.2349028,15.982268,16.165777,False
624,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_3,bmosl,incorrect,generic,0.0,,0.25690693,17.801352,15.034115,True
625,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_5,egmo,incorrect,generic,0.0,,0.29431254,13.886413,14.492917,True
626,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_7-part1,elmosl,incorrect,generic,0.0,,0.16857325,17.371548,17.421444,True
627,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_1-part1,elmosl,incorrect,generic,0.0,,0.21460152,16.519846,16.200792,True
628,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_4-part1,korea,incorrect,generic,0.0,,0.18321136,17.12968,17.409231,True
629,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_19,matharena,incorrect,matharena,0.0,,0.2938631,14.946758,14.990851,True
630,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_3-part1,elmosl,incorrect,generic,0.0,,0.25003764,17.256691,15.490411,True
631,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_11-part1,matharena,incorrect,matharena,0.0,,0.20110667,16.371922,17.306044,True
632,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_29,imosl,incorrect,generic,0.0,,0.22757322,15.0234785,15.448117,True
633,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_2-part1,canada,detected,generic,0.0,,0.24518196,15.860941,15.577962,False
634,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_22-part1,chinatst,incorrect,generic,0.0,,0.2280967,14.727134,15.768263,True
635,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_2-part1,bmosl,incorrect,generic,0.0,,0.19335909,17.753784,16.80387,True
636,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_13-part1,matharena,incorrect,matharena,0.0,,0.30660114,15.513307,14.591909,True
637,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_4-part1,matharena,correct,matharena,0.0,,0.21589163,19.96265,16.256163,False
638,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_30-part1,matharena,incorrect,matharena,0.0,,0.19216178,18.266531,16.618946,True
639,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_2,india,incorrect,generic,0.0,,0.18668689,16.458723,16.748962,True
640,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_4,chinatst,incorrect,generic,0.0,,0.2674205,14.601124,14.9829645,True
641,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_7,philippines,incorrect,generic,0.0,,0.23375449,16.649567,15.666099,True
642,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_22,matharena,correct,matharena,0.0,,0.2131735,18.588596,16.795002,False
643,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_5,elmosl,incorrect,generic,0.0,,0.35131887,14.002565,12.622833,True
644,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_1,imosl,correct,generic,0.0,,0.22941494,19.037739,15.710263,False
645,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_6-part1,israel,incorrect,generic,0.0,,0.24846148,16.78135,15.170626,True
646,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_4,bmosl,incorrect,generic,0.0,,0.23984928,15.3679905,15.568592,True
647,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_12,allrussian,incorrect,generic,0.0,,0.2509016,15.2790365,15.724593,True
648,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_10_2025_2,romania,detected,generic,0.0,,0.30541223,15.172498,14.000428,False
649,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_30,imosl,incorrect,generic,0.0,,0.22916284,16.70082,15.244629,True
650,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_5-part1,matharena,detected,matharena,0.0,,0.2195949,13.703046,16.011902,False
651,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_6-part1,bmosl,detected,generic,0.0,,0.22643128,16.950296,15.916162,False
652,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_5,china,incorrect,generic,0.0,,0.20047468,17.066969,16.21695,True
653,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,german_2025_4-part1,german,correct,generic,0.0,,0.19234121,17.287636,17.405897,False
654,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_3,bmosl,incorrect,generic,0.0,,0.3176693,14.203827,13.540966,True
655,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,serbia_tst_bmo_2025_1-part1,serbia,incorrect,generic,0.0,,0.22661194,16.695242,16.080162,True
656,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_6-part1,bmosl,incorrect,generic,0.0,,0.21262226,15.086718,15.910429,True
657,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_28-part1,imosl,incorrect,generic,0.0,,0.25757983,14.881655,14.700022,True
658,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_6-part1,india,incorrect,generic,0.0,,0.2215943,15.836359,16.194998,True
659,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_23-part1,matharena,correct,matharena,0.0,,0.22416317,20.315561,16.953762,False
660,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_7-part1,matharena,correct,matharena,0.0,,0.27044845,18.186335,15.534341,False
661,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_13,matharena,detected,matharena,0.0,,0.28009585,14.243106,14.947221,False
662,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_1-part1,izho,incorrect,generic,0.0,,0.21215123,20.277304,17.407536,True
663,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_11_2025_1-part1,romania,detected,generic,0.0,,0.19539905,15.941687,16.026094,False
664,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_13-part1,matharena,incorrect,matharena,0.0,,0.32204518,14.4267435,13.682612,True
665,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_10-part1,matharena,correct,matharena,0.0,,0.17996901,18.902094,16.602552,False
666,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_17-part1,matharena,incorrect,matharena,0.0,,0.20360258,15.482036,16.087923,True
667,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_5-part1,korea,incorrect,generic,0.0,,0.3142319,16.033644,13.991715,True
668,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_12-part1,allrussian,incorrect,generic,0.0,,0.17396496,17.295904,17.785465,True
669,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_6,bmosl,correct,generic,0.0,,0.26988786,15.935965,14.839265,False
670,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_1,egmo,detected,generic,0.0,,0.22153004,15.539378,16.107922,False
671,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_6,bulgaria,incorrect,generic,0.0,,0.27843356,15.20799,14.469181,True
672,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_7,matharena,incorrect,matharena,0.0,,0.17733563,17.062271,17.076141,True
673,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_21-part1,matharena,correct,matharena,0.0,,0.2625891,21.367392,16.192984,False
674,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_9,matharena,correct,matharena,0.0,,0.18657428,20.001837,17.828388,False
675,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_29,matharena,correct,matharena,0.0,,0.19907714,15.421004,17.020073,False
676,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_30,matharena,incorrect,matharena,0.0,,0.24205573,15.082662,15.525499,True
677,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_3-part1,thai,detected,generic,0.0,,0.18398914,15.923081,17.0573,False
678,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_29,matharena,incorrect,matharena,0.0,,0.19908513,17.57945,16.912338,True
679,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_1-part1,bmosl,correct,generic,0.0,,0.2473044,16.797716,15.406246,False
680,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_10-part1,matharena,incorrect,matharena,0.0,,0.27133414,17.019922,15.287499,True
681,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_38,matharena,incorrect,matharena,0.0,,0.22625983,14.308235,15.920453,True
682,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_7,matharena,detected,matharena,0.0,,0.20770346,15.752693,17.058325,False
683,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_6,china,incorrect,generic,0.0,,0.23592082,15.085815,15.854905,True
684,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_10,matharena,correct,matharena,0.0,,0.18267176,16.702885,16.47794,False
685,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_30-part1,matharena,incorrect,matharena,0.0,,0.2021061,17.997282,16.879503,True
686,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_7,india,incorrect,generic,0.0,,0.22890131,15.831054,16.217361,True
687,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_36-part1,matharena,correct,matharena,0.0,,0.21644618,17.782272,16.349297,False
688,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_13,matharena,incorrect,matharena,0.0,,0.14748488,17.61625,18.627592,True
689,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_1,usamo,incorrect,generic,0.0,,0.202095,17.900606,16.548838,True
690,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_21-part1,matharena,incorrect,matharena,0.0,,0.24832593,16.85909,15.167853,True
691,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmo_2025_3-part1,bmo,corrected,generic,0.0,,0.22384676,16.353012,15.702017,False
692,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_5,matharena,correct,matharena,0.0,,0.2970329,17.29562,14.997106,False
693,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_6-part1,allrussian,incorrect,generic,0.0,,0.28162262,14.714638,14.49615,True
694,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_3,polish,incorrect,generic,0.0,,0.40262574,14.525775,12.458275,True
695,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_6-part1,elmosl,correct,generic,0.0,,0.18999948,16.282125,16.813683,False
696,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_11-part1,allrussian,incorrect,generic,0.0,,0.25978357,16.076544,15.01952,True
697,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_9-part1,matharena,detected,matharena,0.0,,0.19415632,16.429958,17.137634,False
698,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_10,allrussian,incorrect,generic,0.0,,0.19116172,17.303394,16.834812,True
699,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_18-part1,chinatst,incorrect,generic,0.0,,0.19617285,18.104887,16.280947,True
700,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_22-part1,matharena,correct,matharena,0.0,,0.23018658,17.359495,16.166124,False
701,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_10_2025_1,romania,incorrect,generic,0.0,,0.24424393,15.451275,15.840352,True
702,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_4,turkey,incorrect,generic,0.0,,0.23239425,15.380439,15.269888,True
703,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_12-part1,chinatst,incorrect,generic,0.0,,0.22201143,16.609156,15.5621395,True
704,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_1,vietnam,detected,generic,0.0,,0.19922255,17.612274,17.142918,False
705,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_4,iran,incorrect,generic,0.0,,0.2696025,15.981685,14.963566,True
706,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_51,matharena,detected,matharena,0.0,,0.18981856,15.775169,16.237131,False
707,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_7-part1,israel,detected,generic,0.0,,0.31832558,16.302387,13.979381,False
708,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_1-part1,elmosl,incorrect,generic,0.0,,0.25831383,15.896422,14.75687,True
709,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_17-part1,matharena,incorrect,matharena,0.0,,0.2782006,15.142814,14.576079,True
710,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_28,matharena,correct,matharena,0.0,,0.26793063,17.184504,15.681409,False
711,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_46,matharena,detected,matharena,0.0,,0.30249578,13.12334,14.813451,False
712,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_9,matharena,detected,matharena,0.0,,0.1543857,16.872597,17.570662,False
713,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_11,chinatst,detected,generic,0.0,,0.21518975,18.196127,17.03134,False
714,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_25,imosl,detected,generic,0.0,,0.23931764,15.843752,15.759198,False
715,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_3,usamo,incorrect,generic,0.0,,0.37156424,15.214171,12.983648,True
716,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_18,imosl,correct,generic,0.0,,0.24111012,16.508625,16.077644,False
717,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_1,india,correct,generic,0.0,,0.14315815,17.814497,18.91202,False
718,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,nordic_2025_3-part1,nordic,correct,generic,0.0,,0.26421663,16.274864,15.576381,False
719,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_2-part1,matharena,detected,matharena,0.0,,0.23789488,18.788885,15.836701,False
720,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_22,matharena,correct,matharena,0.0,,0.32107624,14.916389,14.004984,False
721,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_17,imosl,incorrect,generic,0.0,,0.24906339,17.086117,15.686729,True
722,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_8-part1,elmosl,incorrect,generic,0.0,,0.2634959,18.300343,15.424402,True
723,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_6-part1,chinatst,incorrect,generic,0.0,,0.25751606,16.175821,14.910912,True
724,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_6,bmosl,incorrect,generic,0.0,,0.2692938,14.960308,14.804682,True
725,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_25-part1,imosl,detected,generic,0.0,,0.2371907,16.028013,15.898606,False
726,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_25,matharena,correct,matharena,0.0,,0.21227579,18.89506,17.338737,False
727,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_25-part1,matharena,correct,matharena,0.0,,0.21976323,17.924843,17.022005,False
728,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_30-part1,matharena,incorrect,matharena,0.0,,0.22905517,15.071662,15.735607,True
729,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_tst_2025_3,romania,incorrect,generic,0.0,,0.2588842,17.668217,15.061078,True
730,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_27,matharena,correct,matharena,0.0,,0.20028928,15.981653,16.80115,False
731,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_30,matharena,incorrect,matharena,0.0,,0.24226004,16.21081,15.554089,True
732,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_8-part1,matharena,correct,matharena,0.0,,0.107992075,21.116726,19.901896,False
733,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_5-part1,izho,incorrect,generic,0.0,,0.16052882,17.334541,17.43327,True
734,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_2-part1,elmosl,incorrect,generic,0.0,,0.26675093,14.002359,15.16849,True
735,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_5,spain,detected,generic,0.0,,0.26490107,18.578295,15.009245,False
736,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_1-part1,matharena,incorrect,matharena,0.0,,0.22219521,20.370981,16.539358,True
737,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_43-part1,matharena,incorrect,matharena,0.0,,0.26666895,14.339498,15.000876,True
738,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,serbia_tst_bmo_2025_3,serbia,detected,generic,0.0,,0.17144008,17.342205,16.761677,False
739,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,german_2025_1-part1,german,detected,generic,0.0,,0.24854212,18.833666,15.811723,False
740,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_2,pan,correct,generic,0.0,,0.25664976,17.444496,16.114162,False
741,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_5,matharena,incorrect,matharena,0.0,,0.22136584,15.729002,15.443274,True
742,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_5,canada,incorrect,generic,0.0,,0.3739533,14.238213,12.596228,True
743,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_5-part1,canada,incorrect,generic,0.0,,0.32111785,14.971785,13.951325,True
744,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_23,matharena,correct,matharena,0.0,,0.22359604,16.63083,16.06756,False
745,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_27,matharena,incorrect,matharena,0.0,,0.25861886,18.406134,15.745381,True
746,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_6-part1,izho,correct,generic,0.0,,0.258581,17.349781,15.1002035,False
747,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_8,chinatst,incorrect,generic,0.0,,0.18013796,17.214645,17.83535,True
748,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_21,chinatst,detected,generic,0.0,,0.37801978,16.904543,12.953305,False
749,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_29,matharena,correct,matharena,0.0,,0.1965919,16.823679,17.092955,False
750,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_26-part1,imosl,correct,generic,0.0,,0.259948,14.613169,14.846805,False
751,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_5,india,detected,generic,0.0,,0.31633052,15.337237,14.158459,False
752,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_2,matharena,correct,matharena,0.0,,0.17681299,17.991947,17.460873,False
753,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_20-part1,chinatst,incorrect,generic,0.0,,0.2628741,15.234044,15.181537,True
754,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_12_2025_3,romania,detected,generic,0.0,,0.28560436,16.172873,14.348079,False
755,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_13,chinatst,incorrect,generic,0.0,,0.28659603,15.041125,14.253106,True
756,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_2,bulgaria,detected,generic,0.0,,0.35152504,15.463842,13.596492,False
757,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_3-part1,bmosl,incorrect,generic,0.0,,0.29946682,14.376446,13.8740635,True
758,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_3,matharena,detected,matharena,0.0,,0.24707507,17.230808,15.808257,False
759,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_4-part1,iran,incorrect,generic,0.0,,0.30618036,13.405749,14.193194,True
760,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_26,matharena,correct,matharena,0.0,,0.21410404,19.181648,16.880957,False
761,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_7-part1,elmosl,incorrect,generic,0.0,,0.347949,16.709604,13.7958555,True
762,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_39,matharena,detected,matharena,0.0,,0.23155576,17.25279,15.989057,False
763,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_25-part1,matharena,detected,matharena,0.0,,0.20907784,15.813536,16.876871,False
764,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_5,bmosl,detected,generic,0.0,,0.14618765,18.469135,18.274126,False
765,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_5,elmosl,incorrect,generic,0.0,,0.19486506,15.874182,16.616096,True
766,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_9-part1,usatst,incorrect,generic,0.0,,0.18983701,17.212948,17.306795,True
767,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_8,matharena,detected,matharena,0.0,,0.18490224,19.544216,17.116137,False
768,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_2-part1,usamo,incorrect,generic,0.0,,0.26056892,18.50277,15.241442,True
769,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_34,matharena,incorrect,matharena,0.0,,0.24837124,15.034522,14.807226,True
770,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_4,allrussian,incorrect,generic,0.0,,0.33715445,13.56039,13.745822,True
771,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_6-part1,usatst,incorrect,generic,0.0,,0.30543572,16.226513,14.773765,True
772,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_45,matharena,correct,matharena,0.0,,0.30036533,17.80954,14.914688,False
773,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_2-part1,chinatst,correct,generic,0.0,,0.22528534,15.218419,16.08939,False
774,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_3-part1,canada,incorrect,generic,0.0,,0.20791818,16.773373,15.953057,True
775,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_2-part1,china,incorrect,generic,0.0,,0.20973971,14.972125,16.629122,True
776,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_9-part1,matharena,detected,matharena,0.0,,0.22095796,20.132938,16.967064,False
777,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_30,matharena,incorrect,matharena,0.0,,0.25925237,15.469065,15.400136,True
778,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_3,china,incorrect,generic,0.0,,0.27673486,16.606588,14.402347,True
779,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_24,matharena,correct,matharena,0.0,,0.18399781,17.490797,16.760612,False
780,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,jbmo_2025_1,jbmo,detected,generic,0.0,,0.19558361,18.918526,17.294632,False
781,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_2-part1,bmosl,incorrect,generic,0.0,,0.17288998,15.01173,17.290203,True
782,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_20-part1,matharena,incorrect,matharena,0.0,,0.16821599,16.69296,17.692572,True
783,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_24-part1,imosl,incorrect,generic,0.0,,0.2014818,16.797384,15.850175,True
784,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_6,elmosl,incorrect,generic,0.0,,0.27630892,15.918172,14.515258,True
785,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_4-part1,bmosl,incorrect,generic,0.0,,0.22843127,16.779554,15.632772,True
786,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_12-part1,imosl,incorrect,generic,0.0,,0.34451246,14.867527,13.972598,True
787,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_3,rmm,incorrect,generic,0.0,,0.27185863,17.181984,14.823946,True
788,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_16-part1,chinatst,incorrect,generic,0.0,,0.14051497,17.559055,18.071548,True
789,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_16,matharena,incorrect,matharena,0.0,,0.22803195,17.671068,16.256607,True
790,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_20,imosl,incorrect,generic,0.0,,0.22301601,14.011326,15.914977,True
791,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_7,israel,incorrect,generic,0.0,,0.29323652,16.652714,14.290553,True
792,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_30-part1,matharena,incorrect,matharena,0.0,,0.20588662,17.53404,17.179081,True
793,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_38-part1,matharena,incorrect,matharena,0.0,,0.19618131,17.56187,16.55782,True
794,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_4,rmm,incorrect,generic,0.0,,0.2781022,17.34402,14.49401,True
795,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_2,korea,detected,generic,0.0,,0.29405507,13.836509,14.071749,False
796,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_3,india,incorrect,generic,0.0,,0.32444456,14.845087,13.909929,True
797,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_16-part1,matharena,incorrect,matharena,0.0,,0.24932656,13.1870985,15.828348,True
798,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_1,turkey,incorrect,generic,0.0,,0.33927405,14.84557,13.338863,True
799,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_14-part1,matharena,incorrect,matharena,0.0,,0.20843016,17.400597,16.0267,True
800,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_35-part1,matharena,correct,matharena,0.0,,0.16483349,17.563183,18.047125,False
801,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_6,korea,incorrect,generic,0.0,,0.23825498,17.064556,15.082645,True
802,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_10-part1,matharena,detected,matharena,0.0,,0.19400768,17.841042,16.779757,False
803,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_9_2025_2,romania,incorrect,generic,0.0,,0.19345017,16.629883,16.413147,True
804,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,greece_2025_4,greece,correct,generic,0.0,,0.22762316,14.355509,16.381098,False
805,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_2,bmosl,incorrect,generic,0.0,,0.18352547,15.124678,16.42027,True
806,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_12-part1,matharena,correct,matharena,0.0,,0.22901851,14.73488,15.878803,False
807,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_3,thai,detected,generic,0.0,,0.23938377,15.5246,15.752991,False
808,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_17-part1,india,incorrect,generic,0.0,,0.39785832,14.673289,12.995737,True
809,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_2-part1,india,incorrect,generic,0.0,,0.2416772,13.675076,15.762036,True
810,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_31-part1,imosl,incorrect,generic,0.0,,0.16951782,16.244993,16.270254,True
811,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_15,imosl,incorrect,generic,0.0,,0.255194,18.075186,14.764188,True
812,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_11,matharena,correct,matharena,0.0,,0.21404883,17.442623,16.54908,False
813,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_16-part1,matharena,correct,matharena,0.0,,0.23704669,18.592093,16.17423,False
814,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_10_2025_1-part1,romania,incorrect,generic,0.0,,0.24257515,15.024129,15.841394,True
815,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_5-part1,elmosl,incorrect,generic,0.0,,0.26916537,15.836871,14.385873,True
816,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_4-part1,india,incorrect,generic,0.0,,0.3087608,14.201736,14.193741,True
817,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_19-part1,matharena,correct,matharena,0.0,,0.15116496,16.101181,17.386677,False
818,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_3,elmosl,incorrect,generic,0.0,,0.1870022,15.819919,16.809605,True
819,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_2-part1,bmosl,incorrect,generic,0.0,,0.26162416,16.801819,15.028453,True
820,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_4,elmosl,incorrect,generic,0.0,,0.3209361,15.16131,13.88135,True
821,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_3-part1,israel,detected,generic,0.0,,0.24873085,17.241808,15.680181,False
822,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_29,matharena,detected,matharena,0.0,,0.16903004,17.768263,17.76137,False
823,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_26,matharena,correct,matharena,0.0,,0.25788897,17.489347,15.783027,False
824,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_26,imosl,correct,generic,0.0,,0.24828027,14.961988,14.920217,False
825,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_5-part1,vietnam,incorrect,generic,0.0,,0.31075394,13.378206,13.876242,True
826,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_8,imosl,incorrect,generic,0.0,,0.27783367,16.851067,14.390556,True
827,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_2,matharena,detected,matharena,0.0,,0.18595584,19.627783,18.159224,False
828,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_3,pan,corrected,generic,0.0,,0.17775391,20.039352,17.359863,False
829,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_6,india,incorrect,generic,0.0,,0.24307756,15.857153,14.993757,True
830,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_9-part1,israel,incorrect,generic,0.0,,0.29316348,16.70896,14.070917,True
831,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_2-part1,matharena,incorrect,matharena,0.0,,0.20713998,17.123083,16.385035,True
832,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_27,matharena,incorrect,matharena,0.0,,0.18887347,15.933023,17.257088,True
833,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_6-part1,rmm,incorrect,generic,0.0,,0.30955696,16.960377,13.998687,True
834,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_9-part1,iran,incorrect,generic,0.0,,0.22163321,16.267561,15.782679,True
835,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_3-part1,imosl,detected,generic,0.0,,0.1768993,21.261456,18.043503,False
836,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_2,bmosl,incorrect,generic,0.0,,0.20822902,17.342299,16.44092,True
837,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_13,matharena,incorrect,matharena,0.0,,0.32211804,14.434961,13.592381,True
838,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_6,elmosl,incorrect,generic,0.0,,0.17593043,17.836382,17.133175,True
839,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_15-part1,matharena,correct,matharena,0.0,,0.31345633,18.29859,14.74927,False
840,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_3,bmosl,detected,generic,0.0,,0.25939843,13.960983,15.367365,False
841,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_11-part1,matharena,incorrect,matharena,0.0,,0.2801108,15.597411,15.379763,True
842,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_24,matharena,incorrect,matharena,0.0,,0.2671653,15.727457,14.747948,True
843,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_5-part1,india,incorrect,generic,0.0,,0.24874467,16.785017,15.172521,True
844,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_5-part1,usatst,incorrect,generic,0.0,,0.24063644,14.855971,15.579335,True
845,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_8,matharena,incorrect,matharena,0.0,,0.17227094,19.788216,17.873987,True
846,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_20,matharena,incorrect,matharena,0.0,,0.27121073,14.288532,15.258012,True
847,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_33-part1,matharena,detected,matharena,0.0,,0.2761298,15.65298,14.93672,False
848,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_4-part1,matharena,correct,matharena,0.0,,0.188284,21.336718,17.563318,False
849,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_2,egmo,incorrect,generic,0.0,,0.16431405,20.143356,17.706575,True
850,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_13-part1,matharena,incorrect,matharena,0.0,,0.2815268,15.120756,14.756952,True
851,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_17-part1,matharena,correct,matharena,0.0,,0.26275223,16.002525,15.073565,False
852,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_12_2025_1,romania,incorrect,generic,0.0,,0.19573188,17.766855,15.900409,True
853,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_16,matharena,correct,matharena,0.0,,0.26601392,17.700783,15.481069,False
854,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_3,izho,incorrect,generic,0.0,,0.27978584,14.807789,14.12989,True
855,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_22-part1,matharena,incorrect,matharena,0.0,,0.30861923,15.183253,14.100644,True
856,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_50-part1,matharena,correct,matharena,0.0,,0.21252745,20.423504,16.619064,False
857,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmo_2025_1,bmo,incorrect,generic,0.0,,0.28497466,13.832384,14.346973,True
858,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_2,matharena,correct,matharena,0.0,,0.25113615,15.335704,15.609639,False
859,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_4,polish,detected,generic,0.0,,0.23489456,15.036098,15.938559,False
860,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_35-part1,imosl,incorrect,generic,0.0,,0.1950534,17.086895,17.071552,True
861,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_10,matharena,correct,matharena,0.0,,0.32256323,14.469872,13.662992,False
862,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_14,chinatst,incorrect,generic,0.0,,0.24388315,14.433877,15.5452585,True
863,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_8,matharena,detected,matharena,0.0,,0.22218911,16.690664,16.035143,False
864,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_18,india,incorrect,generic,0.0,,0.22996178,14.683774,15.781924,True
865,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_20-part1,matharena,detected,matharena,0.0,,0.20713627,14.880448,16.312016,False
866,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_7-part1,turkey,incorrect,generic,0.0,,0.20772824,16.0962,16.605902,True
867,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_NT_2025_3-part1,elmosl,incorrect,generic,0.0,,0.12672208,19.511108,18.920914,True
868,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_12,matharena,correct,matharena,0.0,,0.25610027,15.911022,15.813997,False
869,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_1,iran,incorrect,generic,0.0,,0.21145621,16.098724,15.9990635,True
870,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_30-part1,matharena,incorrect,matharena,0.0,,0.26767302,15.998343,14.9899845,True
871,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_22-part1,matharena,incorrect,matharena,0.0,,0.27042362,15.848984,14.844941,True
872,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_15,matharena,detected,matharena,0.0,,0.24027015,14.960989,15.0061245,False
873,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_7-part1,bmosl,detected,generic,0.0,,0.27170855,14.839524,15.258849,False
874,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_17,chinatst,incorrect,generic,0.0,,0.27294433,17.732977,14.940449,True
875,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_6-part1,usamo,incorrect,generic,0.0,,0.3626081,14.303678,13.229658,True
876,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_2-part1,india,incorrect,generic,0.0,,0.25273308,21.990286,16.76328,True
877,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_11-part1,imosl,incorrect,generic,0.0,,0.35123995,13.830077,13.562425,True
878,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_6,matharena,detected,matharena,0.0,,0.18393335,15.076329,17.241673,False
879,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_22,chinatst,incorrect,generic,0.0,,0.21319778,15.837504,16.014908,True
880,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmo_2025_2-part1,bmo,incorrect,generic,0.0,,0.26252997,16.919434,15.298189,True
881,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_3,elmosl,incorrect,generic,0.0,,0.24519186,17.286438,15.550263,True
882,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_28,matharena,incorrect,matharena,0.0,,0.20945568,16.384468,15.941618,True
883,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_5,philippines,detected,generic,0.0,,0.2128092,17.415136,16.175024,False
884,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_40-part1,matharena,incorrect,matharena,0.0,,0.38487086,14.473153,13.023415,True
885,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_8,usatst,incorrect,generic,0.0,,0.25182924,15.4669075,15.236116,True
886,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_18-part1,matharena,correct,matharena,0.0,,0.20056567,19.083286,17.227089,False
887,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,jbmo_2025_3-part1,jbmo,incorrect,generic,0.0,,0.21010023,16.904934,16.578888,True
888,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_2,bmosl,incorrect,generic,0.0,,0.24767369,17.649868,15.265082,True
889,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_13,matharena,incorrect,matharena,0.0,,0.3394159,14.584427,13.693955,True
890,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_13-part1,chinatst,incorrect,generic,0.0,,0.32661876,13.545978,13.783257,True
891,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_A_2025_1,elmosl,incorrect,generic,0.0,,0.24740033,17.508135,15.159135,True
892,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_2-part1,imosl,correct,generic,0.0,,0.19892251,17.278683,16.852825,False
893,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_32-part1,imosl,incorrect,generic,0.0,,0.3060152,15.790119,14.27757,True
894,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_24,matharena,detected,matharena,0.0,,0.20441335,17.232405,16.743317,False
895,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_3-part1,polish,incorrect,generic,0.0,,0.38406238,14.366094,12.6336565,True
896,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,german_2025_2-part1,german,incorrect,generic,0.0,,0.25496918,15.93591,15.28925,True
897,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_37,imosl,incorrect,generic,0.0,,0.25300992,16.941782,15.0698595,True
898,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_4,thai,detected,generic,0.0,,0.19010708,13.913624,17.441004,False
899,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_C_2025_4,elmosl,incorrect,generic,0.0,,0.25251746,16.129498,15.224104,True
900,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_15-part1,chinatst,incorrect,generic,0.0,,0.22164258,17.579123,15.946798,True
901,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,pan_african_2025_4,pan,correct,generic,0.0,,0.21082097,19.19785,16.720684,False
902,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,polish_2025_2-part1,polish,incorrect,generic,0.0,,0.1631527,18.6934,17.033531,True
903,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_21,india,detected,generic,0.0,,0.23741232,16.593311,15.590728,False
904,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,egmo_2025_6,egmo,incorrect,generic,0.0,,0.2657176,17.903439,14.917104,True
905,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_4-part1,izho,incorrect,generic,0.0,,0.18939553,18.37019,16.995422,True
906,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,serbia_tst_bmo_2025_2,serbia,correct,generic,0.0,,0.25676194,17.115604,15.238237,False
907,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,spain_2025_2,spain,incorrect,generic,0.0,,0.23281597,16.744774,15.861095,True
908,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_17,india,incorrect,generic,0.0,,0.3966499,13.953903,12.867683,True
909,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,canada_2025_4,canada,incorrect,generic,0.0,,0.25989386,14.713198,15.077298,True
910,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_16,allrussian,incorrect,generic,0.0,,0.32046682,16.10736,13.896985,True
911,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_6,vietnam,detected,generic,0.0,,0.25420678,19.897644,16.15822,False
912,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_6-part1,philippines,detected,generic,0.0,,0.26271915,14.724385,14.577746,False
913,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,korea_2025_2-part1,korea,incorrect,generic,0.0,,0.3060539,14.293313,14.027396,True
914,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_7-part1,allrussian,incorrect,generic,0.0,,0.23605499,16.873407,15.437823,True
915,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_19,matharena,correct,matharena,0.0,,0.22147505,17.44491,16.112562,False
916,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_1,matharena,correct,matharena,0.0,,0.21050544,18.127783,16.983475,False
917,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_4,vietnam,incorrect,generic,0.0,,0.2342547,14.861821,15.884339,True
918,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_17,matharena,incorrect,matharena,0.0,,0.20802914,15.814106,15.833964,True
919,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_4-part1,usamo,incorrect,generic,0.0,,0.23829456,14.5288105,15.622923,True
920,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_3-part1,india,incorrect,generic,0.0,,0.32713923,14.233764,14.181605,True
921,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,izho_2025_2,izho,incorrect,generic,0.0,,0.33096352,13.606361,13.669414,True
922,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_2-part1,matharena,detected,matharena,0.0,,0.19623776,19.25939,17.670578,False
923,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_28-part1,matharena,incorrect,matharena,0.0,,0.21005411,17.005554,16.06033,True
924,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,rmm_2025_2-part1,rmm,incorrect,generic,0.0,,0.23732062,15.253232,14.945595,True
925,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_23-part1,matharena,correct,matharena,0.0,,0.20377903,18.310898,16.893692,False
926,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_41-part1,matharena,incorrect,matharena,0.0,,0.19661328,18.203424,16.972565,True
927,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_5,iran,incorrect,generic,0.0,,0.21000251,16.035133,15.789062,True
928,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_18-part1,matharena,incorrect,matharena,0.0,,0.338766,12.809894,13.735127,True
929,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_5-part1,israel,incorrect,generic,0.0,,0.26359698,16.691309,14.581192,True
930,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_21,matharena,correct,matharena,0.0,,0.27954248,19.179897,15.49765,False
931,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,elmosl_G_2025_7-part1,elmosl,detected,generic,0.0,,0.1099352,20.385715,20.17006,False
932,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,nordic_2025_1,nordic,correct,generic,0.0,,0.28474414,16.544409,14.780697,False
933,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_prep_2025_20-part1,india,incorrect,generic,0.0,,0.24993221,16.58975,15.049535,True
934,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_5-part1,matharena,correct,matharena,0.0,,0.19022527,18.231747,16.771034,False
935,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_8-part1,israel,incorrect,generic,0.0,,0.21453181,16.184134,15.987286,True
936,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_5,bmosl,incorrect,generic,0.0,,0.34261018,14.613852,13.581224,True
937,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_24-part1,matharena,detected,matharena,0.0,,0.15444686,17.562107,17.439594,False
938,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_7,matharena,correct,matharena,0.0,,0.26221475,19.280773,15.789772,False
939,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_36,imosl,incorrect,generic,0.0,,0.2370612,16.337894,15.800651,True
940,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_4-part1,imosl,incorrect,generic,0.0,,0.20953923,16.42384,15.953214,True
941,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_25,matharena,correct,matharena,0.0,,0.20869079,20.351648,16.7153,False
942,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_4,imosl,incorrect,generic,0.0,,0.19914185,15.948434,16.398424,True
943,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_11_2025_2,romania,detected,generic,0.0,,0.2766123,17.435825,14.827998,False
944,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_2,allrussian,incorrect,generic,0.0,,0.22123986,16.663052,15.947655,True
945,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_3-part1,china,incorrect,generic,0.0,,0.19860163,16.171167,16.211575,True
946,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_22-part1,matharena,incorrect,matharena,0.0,,0.23492736,15.614035,16.298367,True
947,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,israel_tst_2025_9,israel,incorrect,generic,0.0,,0.30628213,14.92163,13.86742,True
948,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmo_2025_1-part1,bmo,incorrect,generic,0.0,,0.2709054,14.271592,14.920338,True
949,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_17,matharena,detected,matharena,0.0,,0.23136611,17.631405,15.825924,False
950,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_32,matharena,incorrect,matharena,0.0,,0.3258426,13.614625,13.735905,True
951,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_3,philippines,detected,generic,0.0,,0.22633293,14.779166,15.561097,False
952,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_38-part1,matharena,incorrect,matharena,0.0,,0.21914676,15.038152,16.060734,True
953,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_G_2025_3-part1,bmosl,detected,generic,0.0,,0.17121387,14.36119,17.66259,False
954,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_7-part1,iran,incorrect,generic,0.0,,0.19963603,18.582958,16.281445,True
955,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_2-part1,bulgaria,incorrect,generic,0.0,,0.35308477,15.506502,13.484381,True
956,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_14-part1,matharena,incorrect,matharena,0.0,,0.30814663,13.551792,14.633553,True
957,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,philippines_2025_2-part1,philippines,incorrect,generic,0.0,,0.21807443,16.055126,15.87541,True
958,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_8,turkey,incorrect,generic,0.0,,0.21776453,15.632717,15.634326,True
959,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_29-part1,matharena,incorrect,matharena,0.0,,0.16364147,20.718409,17.716827,True
960,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_19-part1,matharena,correct,matharena,0.0,,0.20189819,17.775953,16.451622,False
961,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_19-part1,matharena,incorrect,matharena,0.0,,0.27109447,13.920861,15.09522,True
962,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_5-part1,matharena,correct,matharena,0.0,,0.24506305,16.358768,16.219488,False
963,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_brumo_brumo_2025_22,matharena,incorrect,matharena,0.0,,0.2598971,14.122216,15.447706,True
964,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_6,usatst,incorrect,generic,0.0,,0.3321949,15.86299,14.443704,True
965,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_33-part1,imosl,incorrect,generic,0.0,,0.15324838,18.957699,18.085297,True
966,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_36,matharena,correct,matharena,0.0,,0.20930749,18.072783,16.829168,False
967,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_22,matharena,incorrect,matharena,0.0,,0.19703186,18.505392,16.7019,True
968,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,greece_2025_3,greece,detected,generic,0.0,,0.1719016,17.477228,17.6199,False
969,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_52,matharena,correct,matharena,0.0,,0.20121117,17.373638,16.74826,False
970,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_6-part1,vietnam,detected,generic,0.0,,0.2522879,14.774896,15.839428,False
971,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_6-part1,thai,incorrect,generic,0.0,,0.25046614,15.4031515,14.834976,True
972,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_28,matharena,correct,matharena,0.0,,0.15258361,16.920685,17.623446,False
973,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,china_2025_1-part1,china,incorrect,generic,0.0,,0.2881086,14.121996,14.27,True
974,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_5,turkey,incorrect,generic,0.0,,0.281235,17.862532,14.674664,True
975,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_4-part1,vietnam,incorrect,generic,0.0,,0.24471137,13.603633,15.848057,True
976,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_6-part1,bmosl,correct,generic,0.0,,0.28112355,17.210861,14.40564,False
977,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_6,imosl,incorrect,generic,0.0,,0.17875469,17.82426,16.764341,True
978,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_15,matharena,incorrect,matharena,0.0,,0.32669345,13.49482,13.604663,True
979,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,vietnam_2025_2-part1,vietnam,correct,generic,0.0,,0.21057479,16.564608,16.875746,False
980,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_4-part1,bmosl,incorrect,generic,0.0,,0.2642667,18.623657,15.484089,True
981,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_6-part1,matharena,incorrect,matharena,0.0,,0.18974982,16.978481,16.72861,True
982,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_3-part1,bulgaria,detected,generic,0.0,,0.316944,14.289478,13.677962,False
983,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_C_2025_4,bmosl,incorrect,generic,0.0,,0.18335383,21.441711,17.694553,True
984,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_16,chinatst,incorrect,generic,0.0,,0.20778207,17.833965,16.466864,True
985,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,romania_12_2025_2,romania,correct,generic,0.0,,0.14393093,20.663687,18.431509,False
986,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_26-part1,matharena,incorrect,matharena,0.0,,0.319344,16.527748,14.5501175,True
987,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,iran_tst_2025_5-part1,iran,incorrect,generic,0.0,,0.30467832,14.232577,13.917698,True
988,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,india_2025_4-part1,india,incorrect,generic,0.0,,0.25253925,18.915394,15.317577,True
989,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,chinatst_2025_10-part1,chinatst,incorrect,generic,0.0,,0.29467282,17.753952,14.347097,True
990,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_22-part1,imosl,incorrect,generic,0.0,,0.23335811,16.239004,15.460704,True
991,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,allrussian_2025_6,allrussian,incorrect,generic,0.0,,0.33130905,15.295052,14.032646,True
992,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_4-part1,bmosl,detected,generic,0.0,,0.21329789,17.043724,16.411343,False
993,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,turkey_tst_2025_2-part1,turkey,incorrect,generic,0.0,,0.2428025,16.038836,15.530504,True
994,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_cmimc_cmimc_2025_40,matharena,incorrect,matharena,0.0,,0.3883606,14.961722,12.984404,True
995,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_A_2025_1-part1,bmosl,incorrect,generic,0.0,,0.12942185,18.97925,18.42895,True
996,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_21-part1,imosl,incorrect,generic,0.0,,0.28270185,16.379078,14.429285,True
997,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_aime_aime_2025_23,matharena,detected,matharena,0.0,,0.19758463,14.581482,16.433477,False
998,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,thai_2025_2,thai,correct,generic,0.0,,0.3075192,16.035582,14.068636,False
999,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usatst_2025_5,usatst,incorrect,generic,0.0,,0.25105095,14.024373,15.347409,True
1000,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_6,bmosl,incorrect,generic,0.0,,0.22473046,13.685711,16.15209,True
1001,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,imosl_2025_19-part1,imosl,incorrect,generic,0.0,,0.30513313,15.239545,14.093074,True
1002,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_24-part1,matharena,correct,matharena,0.0,,0.28335562,15.1877575,14.535442,False
1003,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,usamo_2025_1-part1,usamo,incorrect,generic,0.0,,0.27928242,16.176722,14.740055,True
1004,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bulgaria_2025_1-part1,bulgaria,corrected,generic,0.0,,0.20040502,20.637323,16.717104,False
1005,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_smt_smt_2025_18,matharena,detected,matharena,0.0,,0.2019247,16.272408,16.769814,False
1006,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,bmosl_NT_2025_1,bmosl,detected,generic,0.0,,0.25934333,16.078966,15.364145,False
1007,GPT-5-mini (medium),Qwen3-4B (25/07),qwen/qwen3_4b_logprobs,matharena_hmmt_hmmt_feb_2025_16,matharena,detected,matharena,0.0,,0.3226043,14.13142,13.960188,False
