,Dataset,Number of sampled points,Fraction of sampled points,Number of source models,Technique,Mean estimation error against seen accuracies,Mean estimation error against unseen accuracies,Kendall tau correlation against seen accuracies,Kendall tau correlation against unseen accuracies
0,mmlu-pro_all,10,10,300,Random (uniform),0.08644585828343314,0.08760382059800664,0.3019298199277709,0.28575028575042866
1,mmlu-pro_all,10,10,300,Random (uniform),0.08519315535595476,0.08441133720930233,0.4371971167757021,0.4305416665554739
2,mmlu-pro_all,10,10,300,Random (uniform),0.13247359447771123,0.1372248754152824,-0.17796503660044974,-0.17796503660044974
3,mmlu-pro_all,10,10,300,Random (uniform),0.15168309214903525,0.15130294850498338,0.007449146922880655,-0.03724573461440328
4,mmlu-pro_all,10,10,300,Random (uniform),0.10889367099135064,0.10656665282392028,0.23756435900764947,0.25780642452763314
5,mmlu-pro_all,10,10,300,Random (uniform),0.07497816866267465,0.07376972591362127,0.4099756287561005,0.41945483404525896
6,mmlu-pro_all,10,10,300,Random (uniform),0.08413693446440453,0.08501349667774086,0.5584958036762804,0.5584958036762804
7,mmlu-pro_all,10,10,300,Random (uniform),0.21008711743180308,0.21111399501661132,-0.08870373285939512,-0.08870373285939512
8,mmlu-pro_all,10,10,300,Random (uniform),0.08011788922155688,0.08064264950166114,0.17923428880780454,0.1790535179208545
9,mmlu-pro_all,10,10,300,Random (uniform),0.12505925648702593,0.12518168604651164,0.48532191389358514,0.4653805495127296
10,mmlu-pro_all,10,10,300,Random (uniform),0.13267943280106453,0.131172134551495,0.3366253999049505,0.3482728128637493
11,mmlu-pro_all,10,10,300,Random (uniform),0.10393795741849635,0.10468230897009967,0.24889143647031806,0.2533759668571707
12,mmlu-pro_all,10,10,300,Random (uniform),0.06954216566866268,0.07135589700996678,0.4423913461617549,0.4582038768746305
13,mmlu-pro_all,10,10,300,Random (uniform),0.11006944444444444,0.10843023255813954,0.4061477708054118,0.3971222647875138
14,mmlu-pro_all,10,10,300,Random (uniform),0.15359489354624084,0.15854443521594683,-0.0653383972553552,-0.034967410019886035
15,mmlu-pro_all,10,10,300,Random (uniform),0.09586867930805057,0.09567068106312293,0.40079494928024056,0.40079494928024056
16,mmlu-pro_all,10,10,300,Random (uniform),0.12809173320026612,0.12278862126245849,0.43936963681850183,0.4301681784558106
17,mmlu-pro_all,10,10,300,Random (uniform),0.08736797238855623,0.0857610049833887,0.6711856914426312,0.6511834688499699
18,mmlu-pro_all,10,10,300,Random (uniform),0.12470787591483701,0.12310527408637872,0.5313806736351204,0.5198768708934682
19,mmlu-pro_all,10,10,300,Random (uniform),0.14156894544244844,0.13965946843853821,0.14801216367625056,0.16210856021684583
20,mmlu-pro_all,25,25,300,Random (uniform),0.07935878243512973,0.07978405315614617,0.5189216747199441,0.5439982205242869
21,mmlu-pro_all,25,25,300,Random (uniform),0.057027819361277446,0.05797446013289036,0.5655530762689261,0.5810220551906948
22,mmlu-pro_all,25,25,300,Random (uniform),0.05915647870924817,0.06243666943521595,0.5433581647846062,0.5302124027333657
23,mmlu-pro_all,25,25,300,Random (uniform),0.1315783017298736,0.1318656561461794,0.5302124027333657,0.5302124027333657
24,mmlu-pro_all,25,25,300,Random (uniform),0.047658225216234196,0.050860672757475085,0.390283239559475,0.39289702334509585
25,mmlu-pro_all,25,25,300,Random (uniform),0.06092336161011311,0.06132163621262458,0.4093174585032381,0.40503141181734026
26,mmlu-pro_all,25,25,300,Random (uniform),0.12312603958749169,0.12397113787375416,0.5233990674488329,0.5403512639653943
27,mmlu-pro_all,25,25,300,Random (uniform),0.07016820525615436,0.07034572259136213,0.6034931268990228,0.5819398009383434
28,mmlu-pro_all,25,25,300,Random (uniform),0.04942053393213573,0.0477688953488372,0.6449670943550024,0.6507383579882047
29,mmlu-pro_all,25,25,300,Random (uniform),0.07210849135063208,0.07292877906976744,0.5532834206506881,0.5603070926572045
30,mmlu-pro_all,25,25,300,Random (uniform),0.06004553393213573,0.06069248338870432,0.45237375494393695,0.45505022477821844
31,mmlu-pro_all,25,25,300,Random (uniform),0.05316907850964737,0.051231312292358805,0.36107885619031993,0.36983228300705495
32,mmlu-pro_all,25,25,300,Random (uniform),0.04668205256154358,0.04869186046511627,0.6249493906635065,0.615202586430056
33,mmlu-pro_all,25,25,300,Random (uniform),0.06535678642714571,0.06869808970099668,0.5317548828821789,0.5446720055432844
34,mmlu-pro_all,25,25,300,Random (uniform),0.13228376580172985,0.13960132890365448,0.292939552396285,0.33955651464352327
35,mmlu-pro_all,25,25,300,Random (uniform),0.10732119095143047,0.10838247508305648,0.2527141685011451,0.2527141685011451
36,mmlu-pro_all,25,25,300,Random (uniform),0.08059672322022624,0.08494186046511629,0.1617464121700078,0.1704894614764947
37,mmlu-pro_all,25,25,300,Random (uniform),0.06428164504324684,0.06553882890365448,0.6221942795467679,0.651923699047296
38,mmlu-pro_all,25,25,300,Random (uniform),0.04912529108449767,0.05004256644518272,0.49868704173075723,0.5177633663447996
39,mmlu-pro_all,25,25,300,Random (uniform),0.05026447105788423,0.04929401993355481,0.400398336773943,0.38289458434666684
40,mmlu-pro_all,50,50,300,Random (uniform),0.05661136061210911,0.056370431893687706,0.6053062563336344,0.5983527981855589
41,mmlu-pro_all,50,50,300,Random (uniform),0.03759044411177645,0.03696947674418605,0.5461005461008192,0.5530083017624435
42,mmlu-pro_all,50,50,300,Random (uniform),0.0766028359946773,0.08067171926910299,0.4441563942091097,0.4525764206396141
43,mmlu-pro_all,50,50,300,Random (uniform),0.051103002328675984,0.05062188538205979,0.4631749813206516,0.47574986769134797
44,mmlu-pro_all,50,50,300,Random (uniform),0.038854582501663334,0.03908534053156146,0.4788901649369573,0.5068267647393445
45,mmlu-pro_all,50,50,300,Random (uniform),0.03492327844311378,0.035987333887043185,0.6461968353638984,0.6337699731453618
46,mmlu-pro_all,50,50,300,Random (uniform),0.0526145625415835,0.05326515780730896,0.65456538736543,0.6462003344917504
47,mmlu-pro_all,50,50,300,Random (uniform),0.04239624916833001,0.04275228405315615,0.2684962483847531,0.25997255795984026
48,mmlu-pro_all,50,50,300,Random (uniform),0.045396914504324676,0.0435953073089701,0.37797998528983584,0.38186541754449943
49,mmlu-pro_all,50,50,300,Random (uniform),0.03944215735196274,0.04041009136212624,0.7481638723432961,0.7427126993264608
50,mmlu-pro_all,50,50,300,Random (uniform),0.03364250665335995,0.03450477574750831,0.4267558540214519,0.42071417724614746
51,mmlu-pro_all,50,50,300,Random (uniform),0.0633104624085163,0.06006644518272425,0.6531054294476514,0.6240785214722001
52,mmlu-pro_all,50,50,300,Random (uniform),0.04296490352628077,0.04269310631229235,0.5556886929988458,0.5715005715008573
53,mmlu-pro_all,50,50,300,Random (uniform),0.06628035595475715,0.06260797342192691,0.5910667488825619,0.5995713064204407
54,mmlu-pro_all,50,50,300,Random (uniform),0.04588157019294744,0.0437437707641196,0.5809097228706317,0.5782438191973897
55,mmlu-pro_all,50,50,300,Random (uniform),0.05008982035928143,0.05115656146179401,0.508615976298366,0.49194004264923924
56,mmlu-pro_all,50,50,300,Random (uniform),0.041403858948769134,0.04488372093023256,0.5279951740278936,0.5195472512434473
57,mmlu-pro_all,50,50,300,Random (uniform),0.02568051397205588,0.02737230066445183,0.7224754665039204,0.7035734920895736
58,mmlu-pro_all,50,50,300,Random (uniform),0.11028172821024618,0.11326515780730896,0.5321681029763118,0.5253024276180721
59,mmlu-pro_all,50,50,300,Random (uniform),0.03401613439787092,0.03356312292358803,0.5484641294075125,0.5652111257252993
60,mmlu-pro_all,100,100,300,Random (uniform),0.04354041916167664,0.04539451827242525,0.6749516755366745,0.655825835783953
61,mmlu-pro_all,100,100,300,Random (uniform),0.021882900864936793,0.02093230897009967,0.6558783838258452,0.6378416131835474
62,mmlu-pro_all,100,100,300,Random (uniform),0.04428289254823686,0.040270971760797344,0.6749744023301398,0.6708712144740295
63,mmlu-pro_all,100,100,300,Random (uniform),0.023268255156353956,0.024052117940199333,0.7264475770222969,0.709843060976073
64,mmlu-pro_all,100,100,300,Random (uniform),0.03562104956753159,0.034047965116279076,0.797627740268968,0.7759131127083069
65,mmlu-pro_all,100,100,300,Random (uniform),0.036135021623419826,0.03722695182724252,0.6900562453969318,0.673626334792243
66,mmlu-pro_all,100,100,300,Random (uniform),0.04863065535595476,0.05018064784053156,0.5875820502850492,0.5957713471531334
67,mmlu-pro_all,100,100,300,Random (uniform),0.03190015801729873,0.03212105481727574,0.6613039018387263,0.6530889465363819
68,mmlu-pro_all,100,100,300,Random (uniform),0.02178206087824351,0.019934593023255814,0.7302996737109768,0.731629864114715
69,mmlu-pro_all,100,100,300,Random (uniform),0.027474634065202928,0.028594269102990035,0.8514114801428816,0.837895401430781
70,mmlu-pro_all,100,100,300,Random (uniform),0.025108948769128413,0.02570390365448505,0.7598833654668595,0.7832644720453654
71,mmlu-pro_all,100,100,300,Random (uniform),0.03557364437791084,0.0332921511627907,0.7326654013967792,0.7244332058754671
72,mmlu-pro_all,100,100,300,Random (uniform),0.03966546074517631,0.0396750415282392,0.7349493476308676,0.7538969087270346
73,mmlu-pro_all,100,100,300,Random (uniform),0.04138868097139055,0.03834198504983388,0.5606837883572845,0.573097451568147
74,mmlu-pro_all,100,100,300,Random (uniform),0.02212013473053892,0.022582018272425248,0.6655626443482177,0.6545024480053392
75,mmlu-pro_all,100,100,300,Random (uniform),0.0305439121756487,0.03121470099667774,0.6957584543323031,0.6998875846250765
76,mmlu-pro_all,100,100,300,Random (uniform),0.023773078842315377,0.026651785714285715,0.7207580086751173,0.7083311464565809
77,mmlu-pro_all,100,100,300,Random (uniform),0.03311793080505655,0.037373338870431885,0.6366362118714268,0.5996702382788924
78,mmlu-pro_all,100,100,300,Random (uniform),0.029681678310046568,0.03184073920265781,0.7013359472748474,0.673760835314681
79,mmlu-pro_all,100,100,300,Random (uniform),0.03874688123752495,0.03780419435215947,0.4783731506252814,0.4492547849350469
80,mmlu-pro_all,250,250,300,Random (uniform),0.01837840984697272,0.017691029900332226,0.7939188580480351,0.8113740489243785
81,mmlu-pro_all,250,250,300,Random (uniform),0.018561917831004658,0.020064576411960126,0.7620692329408778,0.7526128928115479
82,mmlu-pro_all,250,250,300,Random (uniform),0.02765564703925482,0.03324356312292359,0.7063268548722716,0.7104215033063138
83,mmlu-pro_all,250,250,300,Random (uniform),0.02486082002661344,0.0241281146179402,0.6790061788956343,0.6912773748997721
84,mmlu-pro_all,250,250,300,Random (uniform),0.02345388389886892,0.022731935215946845,0.7533185153306292,0.7357860362050523
85,mmlu-pro_all,250,250,300,Random (uniform),0.01921236693280107,0.018465739202657813,0.7752105184721916,0.7711411719185316
86,mmlu-pro_all,250,250,300,Random (uniform),0.014419619095143048,0.014550872093023257,0.9091710687900554,0.8764817719346825
87,mmlu-pro_all,250,250,300,Random (uniform),0.012735237857618101,0.01402637043189369,0.8221701322772439,0.8058085376050599
88,mmlu-pro_all,250,250,300,Random (uniform),0.01408828176979375,0.014492317275747513,0.8065440763049391,0.7955572512404905
89,mmlu-pro_all,250,250,300,Random (uniform),0.024375124750498998,0.024001453488372092,0.7551586562489401,0.7457059084820709
90,mmlu-pro_all,250,250,300,Random (uniform),0.01914076014637392,0.018635589700996685,0.7604963329503927,0.7490185879886804
91,mmlu-pro_all,250,250,300,Random (uniform),0.04099866932801065,0.03570681063122924,0.7759358782509882,0.767746581382904
92,mmlu-pro_all,250,250,300,Random (uniform),0.03032443446440452,0.031273671096345504,0.8728495415862553,0.867026970474875
93,mmlu-pro_all,250,250,300,Random (uniform),0.01419219893546241,0.016545265780730893,0.8000599817556338,0.8041419204380603
94,mmlu-pro_all,250,250,300,Random (uniform),0.013929557551563533,0.013455149501661129,0.8374139417582364,0.8142337591579037
95,mmlu-pro_all,250,250,300,Random (uniform),0.017573436460412515,0.017633305647840533,0.7595572845148196,0.7800305266850304
96,mmlu-pro_all,250,250,300,Random (uniform),0.04764254823685961,0.0416906146179402,0.7734509108767863,0.7775540987328965
97,mmlu-pro_all,250,250,300,Random (uniform),0.01633038090485696,0.015585755813953488,0.79147479830402,0.7874055191353617
98,mmlu-pro_all,250,250,300,Random (uniform),0.010578967065868264,0.01224397840531562,0.8894357712032899,0.8578288469318313
99,mmlu-pro_all,250,250,300,Random (uniform),0.014846141051230874,0.014968853820598003,0.8142236881771407,0.8101627221513197
100,mmlu-pro_all,500,500,300,Random (uniform),0.014008191949434468,0.013389327242524921,0.8056696669149066,0.8028348361450918
101,mmlu-pro_all,500,500,300,Random (uniform),0.010043745841650034,0.011710963455149506,0.8715914719862331,0.8744957188624111
102,mmlu-pro_all,500,500,300,Random (uniform),0.010145043246839645,0.011019102990033216,0.9113722810424276,0.8950613677351806
103,mmlu-pro_all,500,500,300,Random (uniform),0.011207834331337329,0.01212583056478406,0.8438290108813535,0.8073171786797565
104,mmlu-pro_all,500,500,300,Random (uniform),0.017171407185628745,0.014388704318936874,0.8125737497672771,0.8073088744425697
105,mmlu-pro_all,500,500,300,Random (uniform),0.013182052561543579,0.013639119601328905,0.8513679733184122,0.8554415042912277
106,mmlu-pro_all,500,500,300,Random (uniform),0.010243804058549574,0.011448712624584715,0.8294656089802268,0.8294656089802268
107,mmlu-pro_all,500,500,300,Random (uniform),0.014494594145043248,0.0157578903654485,0.8199733305624493,0.7833492115795112
108,mmlu-pro_all,500,500,300,Random (uniform),0.01462391882900865,0.014042358803986703,0.8351874845687282,0.808082457062099
109,mmlu-pro_all,500,500,300,Random (uniform),0.015160720226214237,0.015989410299003325,0.8271018840890239,0.838058397243144
110,mmlu-pro_all,500,500,300,Random (uniform),0.01094365435795076,0.012981935215946844,0.8810135288673462,0.8635724453366015
111,mmlu-pro_all,500,500,300,Random (uniform),0.015447272122421828,0.016453903654485052,0.7975773867843686,0.7975773867843686
112,mmlu-pro_all,500,500,300,Random (uniform),0.01405343479707253,0.015255191029900337,0.7829630711582366,0.7732856897249463
113,mmlu-pro_all,500,500,300,Random (uniform),0.013887558216899538,0.011571013289036543,0.8357152703921098,0.8316584001474879
114,mmlu-pro_all,500,500,300,Random (uniform),0.016859905189620758,0.022885589700996675,0.8158904733159691,0.8211653506153895
115,mmlu-pro_all,500,500,300,Random (uniform),0.011456711576846306,0.011841569767441862,0.8704524779626359,0.8421121647266431
116,mmlu-pro_all,500,500,300,Random (uniform),0.01625045741849634,0.013427117940199339,0.864113362104463,0.8519427513705974
117,mmlu-pro_all,500,500,300,Random (uniform),0.014629158349966738,0.01104609634551495,0.8357066740460772,0.8681613021643715
118,mmlu-pro_all,500,500,300,Random (uniform),0.011036260811709915,0.012100913621262461,0.8501886852920231,0.82692678748152
119,mmlu-pro_all,500,500,300,Random (uniform),0.025840069860279445,0.024155730897009967,0.8542527624072627,0.8583013536982924
120,mmlu-pro_all,1000,1000,300,Random (uniform),0.005802686294078505,0.007578280730897008,0.9362997708361287,0.9171735887654822
121,mmlu-pro_all,1000,1000,300,Random (uniform),0.0070817323685961435,0.009078176910299,0.9301031200578059,0.9046727587974525
122,mmlu-pro_all,1000,1000,300,Random (uniform),0.01636583083832336,0.012477470930232555,0.9190358720814805,0.9109386397283399
123,mmlu-pro_all,1000,1000,300,Random (uniform),0.009503534597471726,0.00977990033222592,0.8929311150536193,0.8969715273389297
124,mmlu-pro_all,1000,1000,300,Random (uniform),0.012779586660013305,0.0169766403654485,0.8715914719862331,0.8421069885341737
125,mmlu-pro_all,1000,1000,300,Random (uniform),0.014994448602794416,0.014835651993355483,0.9010119396242403,0.8929311150536193
126,mmlu-pro_all,1000,1000,300,Random (uniform),0.012672821024617443,0.01470535714285715,0.8929311150536193,0.8767694659123773
127,mmlu-pro_all,1000,1000,300,Random (uniform),0.008151842149035264,0.010070286544850498,0.9090927641948613,0.8808098781976879
128,mmlu-pro_all,1000,1000,300,Random (uniform),0.01949760894876913,0.017328176910298997,0.8998993477798645,0.9010119396242403
129,mmlu-pro_all,1000,1000,300,Random (uniform),0.014142548236859613,0.014061669435215949,0.8848502904829982,0.8715879076249923
130,mmlu-pro_all,1000,1000,300,Random (uniform),0.017414171656686628,0.019131436877076413,0.903947535911476,0.9149816317727079
131,mmlu-pro_all,1000,1000,300,Random (uniform),0.01274334664005323,0.010644518272425246,0.9190358720814805,0.882598326492347
132,mmlu-pro_all,1000,1000,300,Random (uniform),0.00684668163672655,0.008502699335548172,0.9352226720647772,0.9131331764801718
133,mmlu-pro_all,1000,1000,300,Random (uniform),0.008860778443113771,0.011926702657807308,0.9201255678740976,0.9160810598834421
134,mmlu-pro_all,1000,1000,300,Random (uniform),0.011454777944111776,0.008002387873754157,0.8976738528342286,0.902841407375199
135,mmlu-pro_all,1000,1000,300,Random (uniform),0.01368101297405189,0.013048380398671098,0.9352303367877622,0.9068900235517694
136,mmlu-pro_all,1000,1000,300,Random (uniform),0.016985882401862943,0.011965427740863788,0.8675469639955776,0.8715914719862331
137,mmlu-pro_all,1000,1000,300,Random (uniform),0.008302977378576182,0.013670681063122926,0.8936211492096948,0.8611995202134248
138,mmlu-pro_all,1000,1000,300,Random (uniform),0.014753409846972716,0.013696220930232551,0.8848484848484848,0.8536835829650509
139,mmlu-pro_all,1000,1000,300,Random (uniform),0.014955796739853625,0.014793604651162789,0.9241662964840406,0.8918103648784724
140,mmlu-pro_all,10,10,300,Anchor Points,0.075762853783162,0.07529232612451886,0.5065592891825716,0.5241935483870969
141,mmlu-pro_all,10,10,300,Anchor Points,0.05720122728637695,0.060827373238820495,0.713709677419355,0.7003030810611647
142,mmlu-pro_all,10,10,300,Anchor Points,0.07327540956349042,0.07045451812009183,0.40725806451612906,0.40725806451612906
143,mmlu-pro_all,10,10,300,Anchor Points,0.08221333548467608,0.08441287466766115,0.4435483870967743,0.4274193548387097
144,mmlu-pro_all,10,10,300,Anchor Points,0.08325013367576037,0.07909175607879484,0.6169354838709679,0.619576501111751
145,mmlu-pro_all,10,10,300,Anchor Points,0.07086403733682448,0.07249300977232705,0.5685483870967744,0.560483870967742
146,mmlu-pro_all,10,10,300,Anchor Points,0.09059055591075146,0.09266022609419411,0.528225806451613,0.5443548387096775
147,mmlu-pro_all,10,10,300,Anchor Points,0.08237950551948744,0.08144261334553754,0.5241935483870969,0.5483870967741936
148,mmlu-pro_all,10,10,300,Anchor Points,0.06336930227752247,0.06216064539416847,0.32896081329386123,0.3306451612903226
149,mmlu-pro_all,10,10,300,Anchor Points,0.08356214255443545,0.08271222279263016,0.5000000000000001,0.4944503021901595
150,mmlu-pro_all,10,10,300,Anchor Points,0.05851086476265482,0.05992818516061858,0.3588709677419355,0.3531787872786854
151,mmlu-pro_all,10,10,300,Anchor Points,0.047971721070138085,0.04989869153605048,0.5806451612903226,0.592741935483871
152,mmlu-pro_all,10,10,300,Anchor Points,0.06253557338627105,0.06323225360076548,0.5050515356638118,0.5040322580645162
153,mmlu-pro_all,10,10,300,Anchor Points,0.044079049953139715,0.042773872420769264,0.48387096774193555,0.49596774193548393
154,mmlu-pro_all,10,10,300,Anchor Points,0.07400654346340961,0.07070087684716472,0.5509589081547492,0.560483870967742
155,mmlu-pro_all,10,10,300,Anchor Points,0.06486573179078278,0.06401123231796763,0.5040322580645162,0.5403225806451614
156,mmlu-pro_all,10,10,300,Anchor Points,0.06762420132650807,0.06384356354338108,0.5362903225806452,0.5322580645161291
157,mmlu-pro_all,10,10,300,Anchor Points,0.0903302413214708,0.08927291522062708,0.4339053672280992,0.42583270923315775
158,mmlu-pro_all,10,10,300,Anchor Points,0.08851625142192027,0.08703252866809238,0.5428862501598077,0.5403225806451614
159,mmlu-pro_all,10,10,300,Anchor Points,0.07812089582404132,0.07895715510315024,0.5307772631673958,0.5146319471775129
160,mmlu-pro_all,25,25,300,Anchor Points,0.06423661069242478,0.0629308197272897,0.6397581460991043,0.6612903225806452
161,mmlu-pro_all,25,25,300,Anchor Points,0.04024062109837914,0.042418118263517296,0.6290322580645162,0.6115038431168095
162,mmlu-pro_all,25,25,300,Anchor Points,0.053548567327171386,0.05068982628389544,0.588709677419355,0.588709677419355
163,mmlu-pro_all,25,25,300,Anchor Points,0.053890926607267464,0.05498934418106098,0.6008064516129034,0.6008064516129034
164,mmlu-pro_all,25,25,300,Anchor Points,0.042628007767954675,0.039822463094092625,0.5564516129032259,0.5751768821395734
165,mmlu-pro_all,25,25,300,Anchor Points,0.04359112571297295,0.04518992537875219,0.5201612903225807,0.5120967741935485
166,mmlu-pro_all,25,25,300,Anchor Points,0.05288110258477699,0.055599820740934225,0.5806451612903226,0.5967741935483871
167,mmlu-pro_all,25,25,300,Anchor Points,0.05537202798608016,0.05353856865888738,0.6653225806451614,0.6975806451612904
168,mmlu-pro_all,25,25,300,Anchor Points,0.05010775068752857,0.05182966017844776,0.7043394100586354,0.6814516129032259
169,mmlu-pro_all,25,25,300,Anchor Points,0.04808595617696117,0.04673823633425456,0.6491935483870969,0.6518671330915164
170,mmlu-pro_all,25,25,300,Anchor Points,0.041960004526093005,0.04276537277083644,0.4919354838709678,0.5105956181800423
171,mmlu-pro_all,25,25,300,Anchor Points,0.03373901071456979,0.032603593389779216,0.6733870967741937,0.6774193548387097
172,mmlu-pro_all,25,25,300,Anchor Points,0.03592680240782002,0.036814907150715184,0.5292940093756747,0.5201612903225807
173,mmlu-pro_all,25,25,300,Anchor Points,0.039010218382617035,0.03998226603488154,0.6048387096774195,0.6169354838709679
174,mmlu-pro_all,25,25,300,Anchor Points,0.045949043815946895,0.04410986978409499,0.7164483970510475,0.6814516129032259
175,mmlu-pro_all,25,25,300,Anchor Points,0.033252683336359334,0.031607583448005086,0.7096774193548389,0.7056451612903227
176,mmlu-pro_all,25,25,300,Anchor Points,0.05103708985409297,0.04901194579006157,0.38709677419354843,0.40725806451612906
177,mmlu-pro_all,25,25,300,Anchor Points,0.054173012010532325,0.051959176019022474,0.6115038431168095,0.619576501111751
178,mmlu-pro_all,25,25,300,Anchor Points,0.04086973926987585,0.03963217746222289,0.5348135921648665,0.5080645161290324
179,mmlu-pro_all,25,25,300,Anchor Points,0.044804681182510846,0.04512593112197097,0.4783049862002768,0.45812334121292336
180,mmlu-pro_all,50,50,300,Anchor Points,0.04755296540973503,0.04632187277772331,0.6760851070763406,0.685483870967742
181,mmlu-pro_all,50,50,300,Anchor Points,0.03251052720025321,0.03522028104293798,0.6491935483870969,0.6236128301092216
182,mmlu-pro_all,50,50,300,Anchor Points,0.03852147383100429,0.035610146439447626,0.620967741935484,0.620967741935484
183,mmlu-pro_all,50,50,300,Anchor Points,0.04298597302901276,0.04450779438662328,0.6774193548387097,0.6693548387096775
184,mmlu-pro_all,50,50,300,Anchor Points,0.03375389572425909,0.031645416269789596,0.6653225806451614,0.6760851070763406
185,mmlu-pro_all,50,50,300,Anchor Points,0.0347838332830204,0.03723626338358817,0.7016129032258065,0.6935483870967744
186,mmlu-pro_all,50,50,300,Anchor Points,0.039364696250727665,0.041636468055836956,0.6693548387096775,0.685483870967742
187,mmlu-pro_all,50,50,300,Anchor Points,0.031067598412077825,0.02935991604874763,0.7620967741935485,0.7701612903225807
188,mmlu-pro_all,50,50,300,Anchor Points,0.038517501152783434,0.03784695685931059,0.7124120680535768,0.6975806451612904
189,mmlu-pro_all,50,50,300,Anchor Points,0.035160253563860815,0.03546157910750854,0.7540322580645162,0.7406663710358716
190,mmlu-pro_all,50,50,300,Anchor Points,0.037654854309528384,0.03862725889128404,0.6169354838709679,0.6357218171016337
191,mmlu-pro_all,50,50,300,Anchor Points,0.027429438540403975,0.02670954099139418,0.6451612903225807,0.6733870967741937
192,mmlu-pro_all,50,50,300,Anchor Points,0.02442049452299218,0.0267273206181374,0.703031737644026,0.7056451612903227
193,mmlu-pro_all,50,50,300,Anchor Points,0.033894762639135954,0.03168629057329038,0.7096774193548389,0.713709677419355
194,mmlu-pro_all,50,50,300,Anchor Points,0.030842818976118427,0.028389768474967846,0.7689206740181664,0.7419354838709679
195,mmlu-pro_all,50,50,300,Anchor Points,0.033004913426883165,0.0314231859719172,0.7419354838709679,0.7782258064516131
196,mmlu-pro_all,50,50,300,Anchor Points,0.041245391115394596,0.03855713366809592,0.5443548387096775,0.5645161290322581
197,mmlu-pro_all,50,50,300,Anchor Points,0.03875687862668866,0.03865380368889379,0.7164483970510475,0.7406663710358716
198,mmlu-pro_all,50,50,300,Anchor Points,0.031193238745561014,0.02912143941955892,0.696266752063694,0.7016129032258065
199,mmlu-pro_all,50,50,300,Anchor Points,0.03393366744670635,0.03554401264777708,0.6074675141193389,0.5751768821395734
200,mmlu-pro_all,100,100,300,Anchor Points,0.03403135559910305,0.03285994303969733,0.7487390290308129,0.7822580645161291
201,mmlu-pro_all,100,100,300,Anchor Points,0.029456544360004423,0.03257997634431929,0.717741935483871,0.6881940940687526
202,mmlu-pro_all,100,100,300,Anchor Points,0.03349484026695064,0.030707748781783686,0.7016129032258065,0.685483870967742
203,mmlu-pro_all,100,100,300,Anchor Points,0.02748278760032927,0.026706403027546653,0.7338709677419355,0.7419354838709679
204,mmlu-pro_all,100,100,300,Anchor Points,0.033253247936479116,0.029359171262489252,0.7016129032258065,0.736630042038401
205,mmlu-pro_all,100,100,300,Anchor Points,0.027554589346891832,0.02884811660491167,0.7056451612903227,0.7217741935483872
206,mmlu-pro_all,100,100,300,Anchor Points,0.027660000588285525,0.030804543213017996,0.713709677419355,0.7217741935483872
207,mmlu-pro_all,100,100,300,Anchor Points,0.028143290717478037,0.02659638478992734,0.685483870967742,0.7016129032258065
208,mmlu-pro_all,100,100,300,Anchor Points,0.030173566838490994,0.02981245222326116,0.7204847260485181,0.6975806451612904
209,mmlu-pro_all,100,100,300,Anchor Points,0.029582401529699973,0.02949227583487042,0.7459677419354841,0.7325937130409302
210,mmlu-pro_all,100,100,300,Anchor Points,0.02824039549087102,0.02870543191701963,0.7217741935483872,0.7325937130409302
211,mmlu-pro_all,100,100,300,Anchor Points,0.021920693468689037,0.02290496377108045,0.7580645161290324,0.7862903225806454
212,mmlu-pro_all,100,100,300,Anchor Points,0.030715615429229324,0.031172587705127097,0.6747488516468526,0.6774193548387097
213,mmlu-pro_all,100,100,300,Anchor Points,0.028206497455712007,0.02670072233100706,0.7500000000000001,0.7540322580645162
214,mmlu-pro_all,100,100,300,Anchor Points,0.02397258352113822,0.023034565397120496,0.7487390290308129,0.7459677419354841
215,mmlu-pro_all,100,100,300,Anchor Points,0.030556520954429286,0.02824570838024353,0.7782258064516131,0.806451612903226
216,mmlu-pro_all,100,100,300,Anchor Points,0.030721697902830875,0.03149259310764142,0.6935483870967744,0.7056451612903227
217,mmlu-pro_all,100,100,300,Anchor Points,0.030321866051963922,0.034743293882454666,0.7285573840434596,0.736630042038401
218,mmlu-pro_all,100,100,300,Anchor Points,0.029526502414406547,0.025865890360551903,0.7124120680535768,0.717741935483871
219,mmlu-pro_all,100,100,300,Anchor Points,0.02555637605960727,0.02745810314632264,0.7245210550459887,0.7043394100586354
220,mmlu-pro_all,250,250,300,Anchor Points,0.02729080127211197,0.026682537072185965,0.7608480160232252,0.7741935483870969
221,mmlu-pro_all,250,250,300,Anchor Points,0.023375637825451465,0.025672925660569813,0.7701612903225807,0.7447027000333423
222,mmlu-pro_all,250,250,300,Anchor Points,0.03043433403337844,0.02794456601511364,0.7419354838709679,0.7258064516129034
223,mmlu-pro_all,250,250,300,Anchor Points,0.022740271240491237,0.022583248051859774,0.7701612903225807,0.7701612903225807
224,mmlu-pro_all,250,250,300,Anchor Points,0.025926827044942837,0.02684075136499766,0.7419354838709679,0.7608480160232252
225,mmlu-pro_all,250,250,300,Anchor Points,0.024403715638365946,0.026053632943009768,0.7298387096774195,0.7459677419354841
226,mmlu-pro_all,250,250,300,Anchor Points,0.025322613004436466,0.02824625341940825,0.7620967741935485,0.7701612903225807
227,mmlu-pro_all,250,250,300,Anchor Points,0.023130061192917085,0.02219642522959508,0.7338709677419355,0.7419354838709679
228,mmlu-pro_all,250,250,300,Anchor Points,0.02802920508401923,0.027874622675269578,0.7689206740181664,0.7459677419354841
229,mmlu-pro_all,250,250,300,Anchor Points,0.026611255505564983,0.026234936800789557,0.7701612903225807,0.7729570030156371
230,mmlu-pro_all,250,250,300,Anchor Points,0.024227531324958702,0.025163510479520368,0.7419354838709679,0.7447027000333423
231,mmlu-pro_all,250,250,300,Anchor Points,0.02333050670037195,0.025150234707274576,0.7540322580645162,0.7741935483870969
232,mmlu-pro_all,250,250,300,Anchor Points,0.022116879228319668,0.023187674161332654,0.8000016324914778,0.8024193548387097
233,mmlu-pro_all,250,250,300,Anchor Points,0.022792195571972037,0.02143263788349889,0.7419354838709679,0.7459677419354841
234,mmlu-pro_all,250,250,300,Anchor Points,0.023563781516978733,0.022108148153344956,0.7769933320131078,0.7661290322580647
235,mmlu-pro_all,250,250,300,Anchor Points,0.02335659195626318,0.023271444488626933,0.7661290322580647,0.7782258064516131
236,mmlu-pro_all,250,250,300,Anchor Points,0.025096353806181707,0.02545878779381492,0.7580645161290324,0.7701612903225807
237,mmlu-pro_all,250,250,300,Anchor Points,0.026976092807932256,0.032156565922517165,0.736630042038401,0.7204847260485181
238,mmlu-pro_all,250,250,300,Anchor Points,0.02380540477443581,0.020601463123041737,0.7729570030156371,0.7862903225806454
239,mmlu-pro_all,250,250,300,Anchor Points,0.02230837220713868,0.02370292472028796,0.7487390290308129,0.7447027000333423
240,mmlu-pro_all,500,500,300,Anchor Points,0.023648827210158024,0.0237100553588159,0.7648843450206958,0.7701612903225807
241,mmlu-pro_all,500,500,300,Anchor Points,0.02315189259671162,0.025736015123680397,0.7540322580645162,0.7447027000333423
242,mmlu-pro_all,500,500,300,Anchor Points,0.023461434845361513,0.02383015661804014,0.7379032258064517,0.7217741935483872
243,mmlu-pro_all,500,500,300,Anchor Points,0.021240151115758547,0.021139097638424788,0.7661290322580647,0.7822580645161291
244,mmlu-pro_all,500,500,300,Anchor Points,0.023787624352841638,0.024884148516264578,0.7459677419354841,0.7729570030156371
245,mmlu-pro_all,500,500,300,Anchor Points,0.023486683212394114,0.025869224416475833,0.7459677419354841,0.7540322580645162
246,mmlu-pro_all,500,500,300,Anchor Points,0.02208483901945548,0.023918944682741895,0.7379032258064517,0.7459677419354841
247,mmlu-pro_all,500,500,300,Anchor Points,0.0245698723975225,0.023027653294715233,0.717741935483871,0.7258064516129034
248,mmlu-pro_all,500,500,300,Anchor Points,0.02619601527314163,0.02573240484009698,0.7850659900080493,0.7701612903225807
249,mmlu-pro_all,500,500,300,Anchor Points,0.024443664353693526,0.023586617826272818,0.7620967741935485,0.7810296610105785
250,mmlu-pro_all,500,500,300,Anchor Points,0.022621965677793107,0.023154372894438693,0.7540322580645162,0.7568116870257544
251,mmlu-pro_all,500,500,300,Anchor Points,0.02261116230554675,0.02617520204751166,0.7419354838709679,0.7620967741935485
252,mmlu-pro_all,500,500,300,Anchor Points,0.02184467031664146,0.021925385912467985,0.7757591587796149,0.7943548387096775
253,mmlu-pro_all,500,500,300,Anchor Points,0.023239577686930024,0.021771664330757768,0.7620967741935485,0.7661290322580647
254,mmlu-pro_all,500,500,300,Anchor Points,0.022970416621660533,0.02235929608491168,0.7769933320131078,0.7661290322580647
255,mmlu-pro_all,500,500,300,Anchor Points,0.022845503804178182,0.02273573579866097,0.7822580645161291,0.7943548387096775
256,mmlu-pro_all,500,500,300,Anchor Points,0.023915454115670823,0.023389127192683885,0.7540322580645162,0.7580645161290324
257,mmlu-pro_all,500,500,300,Anchor Points,0.025867181850875075,0.03153729198407455,0.7204847260485181,0.7043394100586354
258,mmlu-pro_all,500,500,300,Anchor Points,0.022156801912420077,0.02022384020524555,0.7527753580282837,0.7661290322580647
259,mmlu-pro_all,500,500,300,Anchor Points,0.022994688691914743,0.024968090599302797,0.7568116870257544,0.7447027000333423
260,mmlu-pro_all,1000,1000,300,Anchor Points,0.024213961927341883,0.023531286326345446,0.7648843450206958,0.7701612903225807
261,mmlu-pro_all,1000,1000,300,Anchor Points,0.0224888808624687,0.024655684408426092,0.7338709677419355,0.7245210550459887
262,mmlu-pro_all,1000,1000,300,Anchor Points,0.02490718834324967,0.02492415284192034,0.713709677419355,0.7056451612903227
263,mmlu-pro_all,1000,1000,300,Anchor Points,0.021878228243129946,0.020987974966404702,0.7540322580645162,0.7701612903225807
264,mmlu-pro_all,1000,1000,300,Anchor Points,0.025086514087219687,0.026532116528519414,0.7620967741935485,0.78910231900552
265,mmlu-pro_all,1000,1000,300,Anchor Points,0.023711510128082522,0.02582953278968015,0.7500000000000001,0.7580645161290324
266,mmlu-pro_all,1000,1000,300,Anchor Points,0.02108149073361002,0.022537918488297688,0.7298387096774195,0.7379032258064517
267,mmlu-pro_all,1000,1000,300,Anchor Points,0.023901933607389843,0.02235825813093175,0.717741935483871,0.7258064516129034
268,mmlu-pro_all,1000,1000,300,Anchor Points,0.02502062486203643,0.023974905726110583,0.7931386480029906,0.7782258064516131
269,mmlu-pro_all,1000,1000,300,Anchor Points,0.02260952954482842,0.021770486370224808,0.7620967741935485,0.7810296610105785
270,mmlu-pro_all,1000,1000,300,Anchor Points,0.022307121440898428,0.023051732032878292,0.7500000000000001,0.7608480160232252
271,mmlu-pro_all,1000,1000,300,Anchor Points,0.024045459805823143,0.02776422500949976,0.7298387096774195,0.7500000000000001
272,mmlu-pro_all,1000,1000,300,Anchor Points,0.022916563697552024,0.022576659025031712,0.751516685067752,0.7701612903225807
273,mmlu-pro_all,1000,1000,300,Anchor Points,0.023271728461858463,0.021312327949090486,0.7540322580645162,0.7580645161290324
274,mmlu-pro_all,1000,1000,300,Anchor Points,0.02156676303539961,0.023205709022750223,0.7850659900080493,0.7741935483870969
275,mmlu-pro_all,1000,1000,300,Anchor Points,0.021903831164941753,0.021924643637449667,0.7903225806451614,0.8024193548387097
276,mmlu-pro_all,1000,1000,300,Anchor Points,0.020503352395064282,0.020068028457775992,0.7661290322580647,0.7701612903225807
277,mmlu-pro_all,1000,1000,300,Anchor Points,0.025657612837783318,0.0314295821881192,0.7285573840434596,0.7124120680535768
278,mmlu-pro_all,1000,1000,300,Anchor Points,0.02244173634057851,0.02054944000154451,0.7487390290308129,0.7620967741935485
279,mmlu-pro_all,1000,1000,300,Anchor Points,0.02265745783828858,0.02430654845856896,0.7447027000333423,0.7325937130409302
280,mmlu-pro_all,10,10,300,tinyBenchmarks,0.09484156686626746,0.09163778616631632,0.28403315553081276,0.2941024058386973
281,mmlu-pro_all,25,25,300,tinyBenchmarks,0.0430649118429807,0.04431227177428089,0.45344222459532424,0.43478460899545274
282,mmlu-pro_all,50,50,300,tinyBenchmarks,0.03175419993346641,0.030805239603848792,0.7178972324985435,0.703031737644026
283,mmlu-pro_all,100,100,300,tinyBenchmarks,0.027143629407850957,0.02663093663171994,0.5025229601851009,0.49596774193548393
284,mmlu-pro_all,250,250,300,tinyBenchmarks,0.026769377910844983,0.026130180115958565,0.7717171717171718,0.748739029030813
285,mmlu-pro_all,500,500,300,tinyBenchmarks,0.014465859946773095,0.014891572751506916,0.8335019379776974,0.7983870967741937
286,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.007713739188289769,0.007972482499010464,0.8980832019372285,0.8951612903225807
287,mmlu-pro_all,10,10,300,tinyBenchmarks,0.08975278609447773,0.09131685756394191,0.490847912078289,0.4851500628381381
288,mmlu-pro_all,25,25,300,tinyBenchmarks,0.06378389055222887,0.06426234256293616,0.27822580645161293,0.2603432203368595
289,mmlu-pro_all,50,50,300,tinyBenchmarks,0.03244552561543581,0.031314682373901156,0.4758064516129033,0.4460143542205113
290,mmlu-pro_all,100,100,300,tinyBenchmarks,0.029680222887558207,0.030878504618669627,0.7003030810611648,0.7151515151515151
291,mmlu-pro_all,250,250,300,tinyBenchmarks,0.018582626413838954,0.017633769697592602,0.8508064516129034,0.8335019379776974
292,mmlu-pro_all,500,500,300,tinyBenchmarks,0.011856495342647972,0.014010375566695956,0.8951612903225807,0.8577199119625216
293,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.018463073852296064,0.019772470562751283,0.8911290322580646,0.8738652279524044
294,mmlu-pro_all,10,10,300,tinyBenchmarks,0.09190473220226215,0.09196241721982393,0.5264641123285974,0.5264641123285974
295,mmlu-pro_all,25,25,300,tinyBenchmarks,0.04326763140385895,0.046210934996010186,0.36917519226059214,0.36917519226059214
296,mmlu-pro_all,50,50,300,tinyBenchmarks,0.0363907601463739,0.03829073925791826,0.5469225791572785,0.5307772631673957
297,mmlu-pro_all,100,100,300,tinyBenchmarks,0.026509481037924165,0.029627027782751246,0.4758064516129033,0.467741935483871
298,mmlu-pro_all,250,250,300,tinyBenchmarks,0.018816533599467732,0.020224563331808124,0.7379032258064517,0.7379032258064517
299,mmlu-pro_all,500,500,300,tinyBenchmarks,0.011180763473053912,0.011615580439342781,0.8024193548387097,0.8024193548387097
300,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.007640968063872421,0.008678186263132374,0.903225806451613,0.8870967741935486
301,mmlu-pro_all,10,10,300,tinyBenchmarks,0.07475153858948769,0.07455936341602507,0.05445085523362902,0.06751906048969998
302,mmlu-pro_all,25,25,300,tinyBenchmarks,0.0650417914171657,0.062649255061306,0.35773540026809053,0.3861916252894159
303,mmlu-pro_all,50,50,300,tinyBenchmarks,0.024627827677977415,0.02355963598826713,0.6639761200839286,0.6639761200839286
304,mmlu-pro_all,100,100,300,tinyBenchmarks,0.03439475216234198,0.03394390275373947,0.653225806451613,0.653225806451613
305,mmlu-pro_all,250,250,300,tinyBenchmarks,0.015744552561543553,0.017376235209536604,0.7245210550459888,0.7164483970510475
306,mmlu-pro_all,500,500,300,tinyBenchmarks,0.011378285096473689,0.013151399996242255,0.8427419354838711,0.8185483870967744
307,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.01261539421157771,0.013618254216926882,0.8750000000000001,0.8427419354838711
308,mmlu-pro_all,10,10,300,tinyBenchmarks,0.06964196606786427,0.06990960631339756,0.15699254058225265,0.1836830324767881
309,mmlu-pro_all,25,25,300,tinyBenchmarks,0.04633441450432469,0.04926321222449896,0.5318528007711817,0.5141710939607694
310,mmlu-pro_all,50,50,300,tinyBenchmarks,0.04985861610113109,0.04904288875184294,0.5161290322580646,0.5428862501598077
311,mmlu-pro_all,100,100,300,tinyBenchmarks,0.028463905522288786,0.026609343605148536,0.6653225806451614,0.6680124490813992
312,mmlu-pro_all,250,250,300,tinyBenchmarks,0.025438705921490338,0.022866448719393976,0.7379032258064517,0.7164483970510475
313,mmlu-pro_all,500,500,300,tinyBenchmarks,0.009423860612109093,0.01172009179315789,0.8629032258064517,0.853683582965051
314,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.008904066866267891,0.010020169378850365,0.9193548387096776,0.9021195309346993
315,mmlu-pro_all,10,10,300,tinyBenchmarks,0.10465527278775785,0.10372768444285299,0.4067193338381712,0.4232358550092645
316,mmlu-pro_all,25,25,300,tinyBenchmarks,0.05671469560878248,0.05682657041951539,0.47676864966663834,0.4848494742372594
317,mmlu-pro_all,50,50,300,tinyBenchmarks,0.03765385894876913,0.03696663290252275,0.48387096774193555,0.4919354838709678
318,mmlu-pro_all,100,100,300,tinyBenchmarks,0.024341941117764443,0.024952831739621515,0.717741935483871,0.717741935483871
319,mmlu-pro_all,250,250,300,tinyBenchmarks,0.01559381237524947,0.0147803092596645,0.7983870967741937,0.8225806451612904
320,mmlu-pro_all,500,500,300,tinyBenchmarks,0.012397080838323301,0.012153741382130465,0.8588709677419356,0.8508064516129034
321,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.010874085163007856,0.010625378534183619,0.9233870967741936,0.9153225806451614
322,mmlu-pro_all,10,10,300,tinyBenchmarks,0.0760458250166334,0.07571439347219183,0.2935971246318526,0.2811036299666674
323,mmlu-pro_all,25,25,300,tinyBenchmarks,0.06961597638057215,0.0699579281359761,0.5131323602344328,0.5131323602344328
324,mmlu-pro_all,50,50,300,tinyBenchmarks,0.04601734031936127,0.044579400860515955,0.2580645161290323,0.2741935483870968
325,mmlu-pro_all,100,100,300,tinyBenchmarks,0.026166417165668653,0.024586375421913644,0.6276491591066923,0.6236128301092215
326,mmlu-pro_all,250,250,300,tinyBenchmarks,0.019247962408516294,0.017035285050054916,0.7258064516129034,0.7258064516129034
327,mmlu-pro_all,500,500,300,tinyBenchmarks,0.01027112441783102,0.011055974982482468,0.8548387096774194,0.8467741935483872
328,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.009631778110445424,0.010143172265656688,0.8951612903225807,0.870967741935484
329,mmlu-pro_all,10,10,300,tinyBenchmarks,0.07474114271457089,0.0751579862147245,0.4273649853020378,0.4149775944237179
330,mmlu-pro_all,25,25,300,tinyBenchmarks,0.04842918330006652,0.04768181314558036,0.47522968890200645,0.4914077208646279
331,mmlu-pro_all,50,50,300,tinyBenchmarks,0.04034119261477047,0.040306786067952696,0.40725806451612906,0.43145161290322587
332,mmlu-pro_all,100,100,300,tinyBenchmarks,0.024866932801064577,0.026388944149353607,0.6572580645161291,0.6411290322580646
333,mmlu-pro_all,250,250,300,tinyBenchmarks,0.02476297405189623,0.026897001954010055,0.7540322580645162,0.7379032258064517
334,mmlu-pro_all,500,500,300,tinyBenchmarks,0.009709747172321995,0.010946317497894561,0.8588709677419356,0.866935483870968
335,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.009449850299401659,0.009822941326649345,0.8790322580645162,0.870967741935484
336,mmlu-pro_all,10,10,300,tinyBenchmarks,0.11776447105788425,0.11684897784718493,0.2851976328694707,0.31587846739715836
337,mmlu-pro_all,25,25,300,tinyBenchmarks,0.05385583000665334,0.05174464539912421,0.5612994519979263,0.5587090323667151
338,mmlu-pro_all,50,50,300,tinyBenchmarks,0.045029732202262224,0.04772934405275835,0.5993948561243975,0.5846774193548389
339,mmlu-pro_all,100,100,300,tinyBenchmarks,0.025823353293413197,0.02591961011531755,0.696266752063694,0.6814516129032259
340,mmlu-pro_all,250,250,300,tinyBenchmarks,0.028261185961410543,0.029902525375605407,0.7729570030156371,0.7338709677419355
341,mmlu-pro_all,500,500,300,tinyBenchmarks,0.012038423153692539,0.012730947296989571,0.8577199119625216,0.8548387096774194
342,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.009917664670659267,0.012530697188127046,0.9223011759220526,0.9072580645161292
343,mmlu-pro_all,10,10,300,tinyBenchmarks,0.09121860445775118,0.09077066313110213,0.33723785056176153,0.3479334862680338
344,mmlu-pro_all,25,25,300,tinyBenchmarks,0.03925482368596141,0.04158532326266626,0.45408701221545267,0.4282828282828283
345,mmlu-pro_all,50,50,300,tinyBenchmarks,0.0341192614770459,0.0351743978819769,0.5953585271269268,0.5535353535353535
346,mmlu-pro_all,100,100,300,tinyBenchmarks,0.024430306054557545,0.02676454952774406,0.6975806451612904,0.6518671330915164
347,mmlu-pro_all,250,250,300,tinyBenchmarks,0.022813747504989983,0.024252778358012612,0.7540322580645162,0.7487390290308129
348,mmlu-pro_all,500,500,300,tinyBenchmarks,0.009356287425149691,0.011258546030861861,0.8588709677419356,0.8577199119625216
349,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.006533807385229769,0.007629677660183675,0.9233870967741936,0.8980832019372285
350,mmlu-pro_all,10,10,300,tinyBenchmarks,0.08637412674650699,0.08545665106663748,0.16758445334664226,0.2029612003230197
351,mmlu-pro_all,25,25,300,tinyBenchmarks,0.050607119095143045,0.052146164481667895,0.3329971422913319,0.33131313131313134
352,mmlu-pro_all,50,50,300,tinyBenchmarks,0.045294827012641405,0.04584484823265984,0.5403225806451614,0.5428862501598077
353,mmlu-pro_all,100,100,300,tinyBenchmarks,0.03002328675981371,0.03071535500427717,0.6074675141193389,0.6222222222222222
354,mmlu-pro_all,250,250,300,tinyBenchmarks,0.04142756154357954,0.04091120085410578,0.6975806451612904,0.696266752063694
355,mmlu-pro_all,500,500,300,tinyBenchmarks,0.010000831669993318,0.0084498362079827,0.8870967741935486,0.8940468729397578
356,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.007958042248836138,0.009096663400773683,0.9072580645161292,0.8819378859473458
357,mmlu-pro_all,10,10,300,tinyBenchmarks,0.10343375748502993,0.10716761037725213,0.06742167476420635,0.03881854001575517
358,mmlu-pro_all,25,25,300,tinyBenchmarks,0.06061834664005326,0.061451418867691,0.535897308761837,0.5075857528272494
359,mmlu-pro_all,50,50,300,tinyBenchmarks,0.050820234530938146,0.050633060429528545,0.5549952371522199,0.5590315661496906
360,mmlu-pro_all,100,100,300,tinyBenchmarks,0.0317853875582169,0.03369778645919678,0.35483870967741943,0.342741935483871
361,mmlu-pro_all,250,250,300,tinyBenchmarks,0.013883690951430477,0.016591806005928345,0.7540322580645162,0.7338709677419355
362,mmlu-pro_all,500,500,300,tinyBenchmarks,0.00928351630073189,0.01116341514092526,0.8508064516129034,0.838709677419355
363,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.009013223552894191,0.010967751650629852,0.8467741935483872,0.8508064516129034
364,mmlu-pro_all,10,10,300,tinyBenchmarks,0.10037217232202264,0.10178565211769154,0.312126534003211,0.29878246360439453
365,mmlu-pro_all,25,25,300,tinyBenchmarks,0.045726255821689994,0.04636394086135596,0.39918966790658006,0.39433952908889897
366,mmlu-pro_all,50,50,300,tinyBenchmarks,0.032580671989354604,0.031053417804479604,0.5171727725197433,0.5161290322580646
367,mmlu-pro_all,100,100,300,tinyBenchmarks,0.01816159347970725,0.018457513268037566,0.6585872025056106,0.653225806451613
368,mmlu-pro_all,250,250,300,tinyBenchmarks,0.017033641051230917,0.01807005106619547,0.791920807920857,0.7983870967741937
369,mmlu-pro_all,500,500,300,tinyBenchmarks,0.01691408848968726,0.01681086884923396,0.7070721499293365,0.7016129032258065
370,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.009122380239520651,0.009378084224684377,0.8848502904829983,0.8629032258064517
371,mmlu-pro_all,10,10,300,tinyBenchmarks,0.12895043246839652,0.12745525007570682,-0.11906380553030024,-0.11906380553030024
372,mmlu-pro_all,25,25,300,tinyBenchmarks,0.051828634397870915,0.04998468939750619,0.4080816408163599,0.4080816408163599
373,mmlu-pro_all,50,50,300,tinyBenchmarks,0.050924193280106445,0.05003577078953941,0.33939463196608155,0.34343504425139204
374,mmlu-pro_all,100,100,300,tinyBenchmarks,0.028999293080505646,0.028994713369495776,0.4783049862002767,0.4661959992078647
375,mmlu-pro_all,250,250,300,tinyBenchmarks,0.013862899201596817,0.016139651068847932,0.7903225806451614,0.7943548387096775
376,mmlu-pro_all,500,500,300,tinyBenchmarks,0.010395874916832828,0.014409663507978248,0.866935483870968,0.8548387096774194
377,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.013894086826347898,0.01818053745775402,0.8629032258064517,0.8508064516129034
378,mmlu-pro_all,10,10,300,tinyBenchmarks,0.08598947937458419,0.08510077367413571,0.38112343371361984,0.39102928726232605
379,mmlu-pro_all,25,25,300,tinyBenchmarks,0.04373544577511641,0.046253678965435654,0.4637677841792605,0.45309519374936397
380,mmlu-pro_all,50,50,300,tinyBenchmarks,0.02747109946773121,0.027735582959219976,0.5884735060762709,0.5979810182259532
381,mmlu-pro_all,100,100,300,tinyBenchmarks,0.021815743512974058,0.02428560998158722,0.6559034620889871,0.6411290322580646
382,mmlu-pro_all,250,250,300,tinyBenchmarks,0.016399492681304036,0.02205381250234851,0.7487390290308129,0.7580645161290324
383,mmlu-pro_all,500,500,300,tinyBenchmarks,0.013623794078509652,0.014068647726695,0.8779015569498752,0.8548387096774194
384,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.015302727877579365,0.012589031516148836,0.9021195309346993,0.866935483870968
385,mmlu-pro_all,10,10,300,tinyBenchmarks,0.07722055888223554,0.07699912329272796,0.27468089237203636,0.27061154581837654
386,mmlu-pro_all,25,25,300,tinyBenchmarks,0.07110778443113769,0.07304619028830486,0.42265108502348653,0.41860657703283116
387,mmlu-pro_all,50,50,300,tinyBenchmarks,0.032934131736526956,0.033992383450153964,0.5872858691319854,0.5993948561243975
388,mmlu-pro_all,100,100,300,tinyBenchmarks,0.019882110778443134,0.019109901045085927,0.8024193548387097,0.8145161290322581
389,mmlu-pro_all,250,250,300,tinyBenchmarks,0.013462658017298727,0.013484520438635475,0.801211305997932,0.8213929509852854
390,mmlu-pro_all,500,500,300,tinyBenchmarks,0.0132755322687957,0.014888937518097785,0.7338709677419355,0.713709677419355
391,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.00907559880239495,0.010125533816088708,0.8346774193548387,0.8306451612903227
392,mmlu-pro_all,10,10,300,tinyBenchmarks,0.1216005489021956,0.1200489662977478,0.2176963155121273,0.20948136020978286
393,mmlu-pro_all,25,25,300,tinyBenchmarks,0.06476630073186959,0.06362633260610562,0.5520753407244585,0.5480308327338032
394,mmlu-pro_all,50,50,300,tinyBenchmarks,0.04014367099135061,0.03789470823027255,0.4509626409580742,0.4711851809113511
395,mmlu-pro_all,100,100,300,tinyBenchmarks,0.031998502994011975,0.03119362051091614,0.4823413151977474,0.4863776441952181
396,mmlu-pro_all,250,250,300,tinyBenchmarks,0.01682572355289418,0.015600640496415779,0.7620967741935485,0.7661290322580647
397,mmlu-pro_all,500,500,300,tinyBenchmarks,0.018275948103792263,0.023072307765421393,0.8577199119625217,0.8456109249701095
398,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.011601796407186268,0.01587772682763014,0.931451612903226,0.9112903225806452
399,mmlu-pro_all,10,10,300,tinyBenchmarks,0.07984551729873586,0.08361877090227075,0.37842621511471203,0.3171169400402615
400,mmlu-pro_all,25,25,300,tinyBenchmarks,0.038792207252162356,0.03580156616445958,0.5469225791572785,0.5832495401345147
401,mmlu-pro_all,50,50,300,tinyBenchmarks,0.03068342481703261,0.030069791894616088,0.39354207725339224,0.3895057482559216
402,mmlu-pro_all,100,100,300,tinyBenchmarks,0.026041666666666647,0.025772928119397964,0.7285573840434596,0.6841577650712819
403,mmlu-pro_all,250,250,300,tinyBenchmarks,0.0199860695276115,0.016470592176665532,0.7527753580282837,0.7729570030156371
404,mmlu-pro_all,500,500,300,tinyBenchmarks,0.008742930805056559,0.013265488679341261,0.8496472539675803,0.8173566219878148
405,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.006866475382567886,0.012154687717587816,0.8779015569498752,0.8496472539675803
406,mmlu-pro_all,10,10,300,tinyBenchmarks,0.09399430306054558,0.09474150743363771,0.4226690815151907,0.4076826926715409
407,mmlu-pro_all,25,25,300,tinyBenchmarks,0.05302416001330672,0.053501071638561194,0.549141341124325,0.5263201029541519
408,mmlu-pro_all,50,50,300,tinyBenchmarks,0.02875499001996007,0.030618144939357154,0.5803845231748789,0.5777789567994007
409,mmlu-pro_all,100,100,300,tinyBenchmarks,0.04176022954091817,0.04187208362897682,0.5993948561243975,0.5725806451612904
410,mmlu-pro_all,250,250,300,tinyBenchmarks,0.021088032268795742,0.021635870700459546,0.8456109249701096,0.8427419354838711
411,mmlu-pro_all,500,500,300,tinyBenchmarks,0.020864520958083783,0.020316136829331324,0.8375382669751681,0.8185483870967744
412,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.010790918163673113,0.01233503369783161,0.8738652279524044,0.8306451612903227
413,mmlu-pro_all,10,10,300,tinyBenchmarks,0.11463531270791748,0.11714437266110081,0.32380380610626686,0.3381495443514812
414,mmlu-pro_all,25,25,300,tinyBenchmarks,0.045487150698602805,0.04448654946474716,0.39271335522987905,0.41295631168502744
415,mmlu-pro_all,50,50,300,tinyBenchmarks,0.03896373918829011,0.039752416540120225,0.5469225791572785,0.5428862501598077
416,mmlu-pro_all,100,100,300,tinyBenchmarks,0.030776987691284098,0.029303819686208973,0.7043394100586354,0.696266752063694
417,mmlu-pro_all,250,250,300,tinyBenchmarks,0.017563830671989356,0.017767182274432303,0.7003030810611647,0.6599397910864578
418,mmlu-pro_all,500,500,300,tinyBenchmarks,0.013337907518296693,0.014332575159757985,0.7971749770004612,0.7931386480029906
419,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.01214238190286165,0.011796423763768766,0.90615585993217,0.9021195309346993
420,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.16280979707252163,0.1637428769261035,0.08133204690032761,0.08338817594100265
421,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.109949372089155,0.11104619560712461,0.4928973181406656,0.5324553760098416
422,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.17465069860279442,0.1785082382300736,0.32826585827547833,0.31587846739715836
423,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.16413527112441784,0.1634809181470945,0.3967177255378458,0.375039161300805
424,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.09802010562208917,0.10156088535829781,0.5202114337359711,0.5097967894444193
425,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.07578852711244179,0.07634054461674658,0.3725861695498219,0.3683761563345696
426,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.14965641633399868,0.14866507122797595,0.1364055678857943,0.12400506171435847
427,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.12154856952761145,0.12086778132549961,-0.09132167229864412,-0.08707415265684672
428,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.13185607950765138,0.130135931763273,0.34362855398524533,0.34963905315407867
429,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.1021914504324684,0.1019233266578692,0.31646797366252666,0.3272771285547231
430,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.1387693363273453,0.1400101893389301,0.16189403145068332,0.15582449833474285
431,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.07108959165003327,0.07549798831185911,0.4948483474217486,0.4731444725348298
432,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.10402632235528941,0.10330102875644061,0.35725155609856163,0.3691884531452314
433,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.09322760728542914,0.09054058100576257,0.26454413165729807,0.26867763371444336
434,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.07717117847638058,0.07386864732605664,0.7143060499659395,0.7135856200802015
435,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.09555888223552894,0.09579685106531124,0.23560961725728108,0.22734261314299054
436,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.16857171074517632,0.16249846134143672,0.20546165870244237,0.2098331833556858
437,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.12524950099800397,0.12467828048222492,-0.10612797431992202,-0.0693913678245644
438,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.09142912092481703,0.09440620247600037,0.09650479431741836,0.08173676140952675
439,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.11360872005988024,0.11582514822238138,0.4696261357858123,0.4758326485935543
440,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.06776637696828564,0.06872358377099622,0.44219430811175925,0.43972195436284073
441,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.045790103820137505,0.0435615632545356,0.4307401010047973,0.4210534942670868
442,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.058550433854513195,0.06076302832227756,0.4989950400884927,0.5071087805777366
443,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.06933442143490796,0.06992424400460799,0.27125728383021674,0.24291697059422396
444,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.05507821163229096,0.05397725739458697,0.6882647500169679,0.6950386716075405
445,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.047653910928143715,0.0458251182320667,0.3384035316985917,0.34650900551173164
446,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.06527119372366377,0.06590302806163237,0.4584263376422738,0.4421988566637862
447,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.06117192697937459,0.0596848195220854,0.5060770220712999,0.5384659514838631
448,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.04755177145708583,0.04872502710525792,0.41540212151471084,0.3967643853038991
449,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.050698516162120205,0.052306900767954675,0.4534450117758847,0.42756023238831203
450,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.04943255849412287,0.05003120374238604,0.527393131800846,0.49746833816309105
451,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.05793794355732979,0.05847562097105161,0.48736321287397255,0.48331870488331713
452,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.06938458153138168,0.06862231868166582,0.5595189369142662,0.5462066863178955
453,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.07292221113328898,0.07405662558235318,0.24696558677079433,0.26316005147707594
454,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.05388875027722334,0.05345084332995141,0.4251020855581165,0.43478460899545274
455,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.10793820345420271,0.10672046370759773,0.4630961649300403,0.4590516569393849
456,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.07778635437458417,0.07374513932406505,0.31408711025917196,0.32624532097888187
457,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.042227004324683966,0.04568862114272746,0.2723627868825332,0.25406976388296004
458,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.04234889595808383,0.039379840989394406,0.5920801190397681,0.613841425460019
459,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.053829667054779325,0.0542185556673659,0.40324401064110954,0.41540212151471084
460,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03608426747695084,0.03665977758630024,0.6545454545454545,0.6357218171016337
461,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03298300809996169,0.033213818972679165,0.6411290322580646,0.6478308040940458
462,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03970885114691251,0.041959817099835026,0.37411698913562214,0.3579389571730006
463,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.043460846164813244,0.04307564273446463,0.6909105007880946,0.6909105007880946
464,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.030673722000443555,0.03153195328151374,0.5872858691319854,0.5898989898989899
465,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.04084082991424561,0.041511090997837,0.5685483870967744,0.5766129032258065
466,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03859913537823341,0.041877158512474345,0.5590315661496906,0.5671042241446319
467,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03287041987453663,0.0336457057737607,0.7016129032258065,0.6693548387096775
468,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03888230483477488,0.0399447974053124,0.6397581460991043,0.6169354838709679
469,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.055883149375851475,0.05414141493682992,0.3579389571730006,0.36842180748370096
470,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03936068834553117,0.0400301881586587,0.5549952371522199,0.5414141414141413
471,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03266952784246323,0.03525630041442892,0.7096774193548389,0.713709677419355
472,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.030176553721261185,0.030813174996982166,0.5399396086679187,0.5348135921648665
473,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.04345224069057655,0.043946983217642524,0.6814516129032259,0.685483870967742
474,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.06183210835273898,0.06636846148396269,0.48888888888888893,0.49848663118763026
475,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.04285596094517313,0.04247309716231065,0.56825337268708,0.5642088646964246
476,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.027655842828232427,0.025556331993865714,0.5590315661496906,0.5630678951471613
477,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03031026653703176,0.029193378673470084,0.5509589081547492,0.5590315661496906
478,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.04198005897926371,0.0424576155977543,0.44534504201326497,0.42669559301414195
479,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.033783734614105124,0.03514109510537803,0.4500506832179819,0.4419780252230405
480,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.03280621465852567,0.03454000969680913,0.7648843450206958,0.7459677419354841
481,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.024207396013198462,0.027018694971130377,0.8104838709677421,0.8092839639928733
482,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.023722087075848306,0.025401916333552342,0.6046539446029784,0.6046539446029784
483,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.02167129981813117,0.022829294032527103,0.6922304230662233,0.684157765071282
484,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.030483691036849392,0.02861342699298562,0.3951612903225807,0.4016147352483337
485,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.02896704293618638,0.030129384930400238,0.748739029030813,0.7245210550459888
486,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.023077064581923143,0.022611336577137896,0.6895161290322582,0.6814516129032259
487,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.02837726704329436,0.02778922777483976,0.6491935483870969,0.6250000000000001
488,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.02630502597030457,0.02593803305271687,0.7070707070707071,0.7083757390561061
489,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.044906710625067084,0.0465008921402206,0.8266129032258066,0.8133202929903441
490,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.028474091655869553,0.030068264218464244,0.6572580645161291,0.6639761200839285
491,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.026755478552160913,0.025540740452102736,0.7056451612903227,0.717741935483871
492,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.02324948730185135,0.023761174596558722,0.6787892639321631,0.6451612903225807
493,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.03832747999958941,0.038703917225576825,0.7379032258064517,0.7419354838709679
494,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.025710723439484648,0.02693194150789495,0.6922304230662234,0.7016129032258065
495,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.037617520342718015,0.03719560190365863,0.7580645161290324,0.7701612903225807
496,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.0292417472255781,0.03187035443087714,0.560483870967742,0.5564516129032259
497,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.03469995978611565,0.03176831548044251,0.593939393939394,0.6222222222222222
498,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.020724664165677358,0.021176608817276298,0.6922304230662234,0.6814516129032259
499,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.02459712271884806,0.02602076113400129,0.736630042038401,0.7447027000333423
500,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.017719822177459123,0.01693427328011409,0.7971749770004612,0.7701612903225807
501,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.02156436352086496,0.023841215339747437,0.7500000000000001,0.7204847260485181
502,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.015738090545227887,0.019001125380238737,0.7338709677419355,0.7258064516129034
503,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.02409873896829782,0.023415312078997795,0.6653225806451614,0.6330645161290324
504,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.01449798456461477,0.016256216437086583,0.7983870967741937,0.7971749770004612
505,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.013024263114870148,0.01455210335917256,0.8548387096774194,0.8870967741935486
506,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.016153804024035663,0.018663638198645614,0.7741935483870969,0.7741935483870969
507,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.020728998411635648,0.02103838512647325,0.8104838709677421,0.7862903225806454
508,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.02147178349310047,0.020833149570950275,0.6155401721142802,0.620967741935484
509,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.01750864340808532,0.016100695955649176,0.7661290322580647,0.7568116870257544
510,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.014977219094655094,0.014167913084811603,0.7056451612903227,0.7164483970510475
511,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.014206287328914189,0.013838138239230678,0.8104838709677421,0.8225806451612904
512,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.01811807568897924,0.017891770271022227,0.6949509130734051,0.6733870967741937
513,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.01599982500198062,0.017762143112351532,0.806451612903226,0.8266129032258066
514,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.022152134431933194,0.026402514143466005,0.7204847260485181,0.7298387096774195
515,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.01562044174312454,0.015823154906672673,0.7379032258064517,0.717741935483871
516,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.015491355350810322,0.013936553756484638,0.7983870967741937,0.8104838709677421
517,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.021203611134206997,0.025326060674546163,0.7406663710358716,0.7164483970510475
518,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.023262599020999516,0.025648664881276057,0.7164483970510475,0.685483870967742
519,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.021560046327897982,0.0200858618022371,0.7931386480029906,0.7931386480029906
520,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.01399568351979811,0.014113510070867072,0.8294656089802268,0.8145161290322581
521,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.014158472860796922,0.015700830252374696,0.8467741935483872,0.8213929509852854
522,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.009856286408205626,0.008417258219563451,0.8508064516129034,0.866935483870968
523,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.011322821112597295,0.012467693814986022,0.9072580645161292,0.8750000000000001
524,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.010874392234259737,0.014096403730151563,0.8548387096774194,0.8617562409599923
525,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.013514287273289833,0.011884315446335546,0.8548387096774194,0.8629032258064517
526,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.01033738895444665,0.013359279762229687,0.899193548387097,0.866935483870968
527,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.012200745559414698,0.013656806157852273,0.8548387096774194,0.8548387096774194
528,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.00884106374040294,0.0112788827601905,0.90615585993217,0.8790322580645162
529,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.010182329524849716,0.01236847107431538,0.8629032258064517,0.8213929509852854
530,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.016892123034974192,0.016532421146504807,0.9072580645161292,0.8900105439422872
531,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.011705135317854531,0.012424346142552048,0.8870967741935486,0.8911290322580646
532,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.010851651239708736,0.010847814978102886,0.8242441062033409,0.8346774193548387
533,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.010108690822350299,0.011580482842520994,0.8830645161290324,0.8629032258064517
534,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.013501026048661085,0.018794192687004583,0.8133202929903441,0.806451612903226
535,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.012388503919996562,0.012652170298504708,0.8266129032258066,0.8145161290322581
536,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.012407937763944001,0.011558983938104787,0.8306451612903227,0.8266129032258066
537,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.00784305011780511,0.015436901837860974,0.9223011759220526,0.8819378859473458
538,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.010762460558413948,0.01294049909859716,0.8294656089802268,0.806451612903226
539,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.009932659603215316,0.011044968183887657,0.8254292799827562,0.8092839639928733
540,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.0053833834807232965,0.0051765387667791745,0.9263375049195233,0.9233870967741936
541,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.0072850342068641796,0.009796799324870037,0.9072580645161292,0.8900105439422872
542,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.008403910256186801,0.012355529251060838,0.903225806451613,0.8951612903225807
543,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.006933590818920742,0.009630344837405056,0.8870967741935486,0.838709677419355
544,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.010068687742111738,0.007458119603080958,0.8830645161290324,0.8738652279524044
545,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.006469245545919197,0.010175386035670538,0.935483870967742,0.9112903225806452
546,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.006756698534364159,0.007294698871457933,0.931451612903226,0.9072580645161292
547,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.008334181908994964,0.010853491289080397,0.8750000000000001,0.8346774193548387
548,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.007542541209834678,0.009769848838925904,0.918264846924582,0.8870967741935486
549,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.010907464040389468,0.014014971576604918,0.935483870967742,0.9465191499068768
550,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.00842948351887414,0.009517352620113415,0.8870967741935486,0.8980832019372285
551,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.006830870673735829,0.00804353561646785,0.9233870967741936,0.903225806451613
552,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.013019617500157604,0.01397083032710721,0.8969715273389298,0.8629032258064517
553,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.01027585251907704,0.010479460891113672,0.8588709677419356,0.8629032258064517
554,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.008295386074064423,0.013381528596947091,0.8779015569498752,0.8750000000000001
555,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.007225071357909949,0.009411330195991,0.9153225806451614,0.8790322580645162
556,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.00825959743402207,0.01332875645065838,0.8629032258064517,0.8508064516129034
557,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.010254475492078058,0.016761362894708885,0.8617562409599923,0.8698288989549336
558,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.007055376277324523,0.007740063433137073,0.9021195309346993,0.870967741935484
559,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.007797243271062713,0.009464379013272637,0.8900105439422872,0.8819378859473458
