,Dataset,Number of sampled points,Fraction of sampled points,Number of source models,Technique,Mean estimation error against seen accuracies,Mean estimation error against unseen accuracies,Kendall tau correlation against seen accuracies,Kendall tau correlation against unseen accuracies
0,mmlu-pro_all,10,10,300,Random (uniform),0.11737434222464162,0.11695862277257629,0.04161251892882395,0.12483755678647185
1,mmlu-pro_all,10,10,300,Random (uniform),0.07042279078207221,0.07034128662035637,0.5169078021169037,0.5964320793656581
2,mmlu-pro_all,10,10,300,Random (uniform),0.06708703804512185,0.07666868015705224,0.6785757999509486,0.6785757999509486
3,mmlu-pro_all,10,10,300,Random (uniform),0.14125083166999333,0.14315916641498036,0.832250378576479,0.832250378576479
4,mmlu-pro_all,10,10,300,Random (uniform),0.12631101433496644,0.13011174871639988,0.20806259464411975,0.1664500757152958
5,mmlu-pro_all,10,10,300,Random (uniform),0.13016391459505233,0.13388704318936873,0.5703845117952784,0.5703845117952784
6,mmlu-pro_all,10,10,300,Random (uniform),0.1000907276356378,0.10027182120205375,0.6883951004425773,0.7277319633250104
7,mmlu-pro_all,10,10,300,Random (uniform),0.12027157805600917,0.12734823316218663,0.8174238913695991,0.7006490497453707
8,mmlu-pro_all,10,10,300,Random (uniform),0.12566382386741667,0.12564180006040473,0.14394032120171638,0.22619193331698287
9,mmlu-pro_all,10,10,300,Random (uniform),0.12235528942115766,0.12691029900332224,0.0,-0.05337605126836238
10,mmlu-pro_all,25,25,300,Random (uniform),0.057587249742938364,0.06425551192993052,0.7741362848505949,0.7741362848505949
11,mmlu-pro_all,25,25,300,Random (uniform),0.1351357890280046,0.12808819087888856,0.42640143271122083,0.38376128944009874
12,mmlu-pro_all,25,25,300,Random (uniform),0.10044335571281676,0.0953518574448807,0.30510802855858954,0.22883102141894215
13,mmlu-pro_all,25,25,300,Random (uniform),0.07182120607270309,0.07862277257626095,0.5371291452680611,0.574172534596893
14,mmlu-pro_all,25,25,300,Random (uniform),0.05976410814734168,0.0573542736333434,0.30510802855858954,0.26696952498876586
15,mmlu-pro_all,25,25,300,Random (uniform),0.05624266618278593,0.06025974025974024,0.2335496832484569,0.2724746304565331
16,mmlu-pro_all,25,25,300,Random (uniform),0.04233230508679613,0.04260344306855936,0.5449492609130662,0.5449492609130662
17,mmlu-pro_all,25,25,300,Random (uniform),0.11557914474082137,0.12255813953488372,0.019668431441216497,0.09834215720608248
18,mmlu-pro_all,25,25,300,Random (uniform),0.21891489747777174,0.21604953186348536,0.5720775535473553,0.6483545606870027
19,mmlu-pro_all,25,25,300,Random (uniform),0.11127200145164216,0.11649652672908489,0.3432465321284132,0.404145188432738
20,mmlu-pro_all,50,50,300,Random (uniform),0.07393999879029821,0.08101781938991243,0.5060243137049899,0.5449492609130662
21,mmlu-pro_all,50,50,300,Random (uniform),0.05580475412810742,0.06361824222289338,0.15569978883230462,0.038924947208076155
22,mmlu-pro_all,50,50,300,Random (uniform),0.08410875219258453,0.09546662639685898,0.7395739969534469,0.7395739969534469
23,mmlu-pro_all,50,50,300,Random (uniform),0.05278896751950643,0.04600422832980972,0.2724746304565331,0.2335496832484569
24,mmlu-pro_all,50,50,300,Random (uniform),0.049147765075908796,0.04270613107822411,0.7706746355884524,0.7706746355884524
25,mmlu-pro_all,50,50,300,Random (uniform),0.03246718683844432,0.036254907882814845,0.5449492609130662,0.5060243137049899
26,mmlu-pro_all,50,50,300,Random (uniform),0.026136817274541844,0.023328299607369388,0.8174238913695991,0.778498944161523
27,mmlu-pro_all,50,50,300,Random (uniform),0.054363394423274654,0.06059498640893989,0.7479575920067657,0.6731618328060892
28,mmlu-pro_all,50,50,300,Random (uniform),0.04631827254581745,0.04970099667774086,0.40655781409087083,0.3252462512726967
29,mmlu-pro_all,50,50,300,Random (uniform),0.03741607693703502,0.03766233766233767,0.4720343200308506,0.381107966983353
30,mmlu-pro_all,100,100,300,Random (uniform),0.039615314824895664,0.045732407127755954,0.9174698042719671,0.9908673886137245
31,mmlu-pro_all,100,100,300,Random (uniform),0.0237706405371076,0.018302627604953194,0.8874245216579991,0.8496617760555312
32,mmlu-pro_all,100,100,300,Random (uniform),0.03158591907094901,0.02698580489278165,0.7105597124064276,0.6357639532057509
33,mmlu-pro_all,100,100,300,Random (uniform),0.05857315671686927,0.0473059498640894,0.8009085749662975,0.8390470785361213
34,mmlu-pro_all,100,100,300,Random (uniform),0.033359341922216186,0.04099969797644216,0.8227533512074424,0.8227533512074424
35,mmlu-pro_all,100,100,300,Random (uniform),0.03794713603096836,0.037009966777408636,0.697277051246695,0.697277051246695
36,mmlu-pro_all,100,100,300,Random (uniform),0.07001088731627654,0.07222289338568409,0.7479575920067657,0.8227533512074424
37,mmlu-pro_all,100,100,300,Random (uniform),0.0340446379967338,0.02738749622470553,0.5983660736054126,0.7479575920067657
38,mmlu-pro_all,100,100,300,Random (uniform),0.030215931772817995,0.027369374811235272,0.9174698042719671,0.9908673886137245
39,mmlu-pro_all,100,100,300,Random (uniform),0.04854654327708219,0.04573844759891272,0.09440686400617011,0.1905539834916765
40,mmlu-pro_all,250,250,300,Random (uniform),0.03160333877699148,0.03754092419208698,0.7706746355884524,0.8440722199302098
41,mmlu-pro_all,250,250,300,Random (uniform),0.0322266981189137,0.02703473270915132,0.8545454545454545,0.8181818181818182
42,mmlu-pro_all,250,250,300,Random (uniform),0.02381068166696909,0.01706735125339778,0.7853554716071041,0.7105597124064276
43,mmlu-pro_all,250,250,300,Random (uniform),0.01635589427206195,0.022086378737541528,0.8909090909090909,0.8545454545454545
44,mmlu-pro_all,250,250,300,Random (uniform),0.025163249259057652,0.01971368166717005,0.8073734277593311,0.7339758434175737
45,mmlu-pro_all,250,250,300,Random (uniform),0.024865178733442208,0.030000604047115666,0.8440722199302098,0.8073734277593311
46,mmlu-pro_all,250,250,300,Random (uniform),0.04582906913445835,0.048041075203865906,0.8334762598987157,0.9075630385563792
47,mmlu-pro_all,250,250,300,Random (uniform),0.05400024194036169,0.04776864995469646,0.574172534596893,0.5371291452680611
48,mmlu-pro_all,250,250,300,Random (uniform),0.012637270912720015,0.012010872848082164,0.9075630385563792,0.8334762598987157
49,mmlu-pro_all,250,250,300,Random (uniform),0.02306635214419646,0.02093023255813954,0.9541685964428458,0.9074074074074073
50,mmlu-pro_all,500,500,300,Random (uniform),0.009862577874553925,0.01245484747810328,0.697277051246695,0.6238794669049376
51,mmlu-pro_all,500,500,300,Random (uniform),0.022558882235528942,0.030027786167321047,0.9272727272727274,0.8181818181818182
52,mmlu-pro_all,500,500,300,Random (uniform),0.01365269461077845,0.020217456961643007,0.8334762598987157,0.759389481241052
53,mmlu-pro_all,500,500,300,Random (uniform),0.013087884836387814,0.010628209000302024,0.7454545454545454,0.7090909090909091
54,mmlu-pro_all,500,500,300,Random (uniform),0.015975201112925674,0.020999697976442157,0.9541685964428458,0.9541685964428458
55,mmlu-pro_all,500,500,300,Random (uniform),0.01374499485876733,0.012553911205073999,0.9541685964428458,0.8807710121010884
56,mmlu-pro_all,500,500,300,Random (uniform),0.010907155386197293,0.011401993355481725,0.8440722199302098,0.9174698042719671
57,mmlu-pro_all,500,500,300,Random (uniform),0.020233956329764727,0.01565086076713983,0.9075630385563792,0.759389481241052
58,mmlu-pro_all,500,500,300,Random (uniform),0.008362426661827864,0.0070335246149199485,0.9541685964428458,0.8807710121010884
59,mmlu-pro_all,500,500,300,Random (uniform),0.011605395270065934,0.01270431893687708,0.8440722199302098,0.8333333333333333
60,mmlu-pro_all,1000,1000,300,Random (uniform),0.010475594266013421,0.008603141045001516,0.8181818181818182,0.8181818181818182
61,mmlu-pro_all,1000,1000,300,Random (uniform),0.007467368293715599,0.012947145877378424,0.8440722199302098,0.8807710121010884
62,mmlu-pro_all,1000,1000,300,Random (uniform),0.0035268856226940096,0.008739353669586227,0.9174698042719671,0.8440722199302098
63,mmlu-pro_all,1000,1000,300,Random (uniform),0.01141498820540738,0.020512231954092436,0.9636363636363636,0.9272727272727274
64,mmlu-pro_all,1000,1000,300,Random (uniform),0.007002661343978692,0.011812443370582904,0.9272727272727274,0.9272727272727274
65,mmlu-pro_all,1000,1000,300,Random (uniform),0.012222161737131781,0.016582603443068552,0.6363636363636364,0.6000000000000001
66,mmlu-pro_all,1000,1000,300,Random (uniform),0.012763926692070405,0.015295077016007248,0.9446064278852111,0.8705196492275474
67,mmlu-pro_all,1000,1000,300,Random (uniform),0.008981491562329892,0.007770764119601309,0.9174698042719671,0.7706746355884524
68,mmlu-pro_all,1000,1000,300,Random (uniform),0.027630557067682818,0.030495922681969212,0.7706746355884524,0.8807710121010884
69,mmlu-pro_all,1000,1000,300,Random (uniform),0.02027073126474324,0.026944125641800067,0.8545454545454545,0.7706746355884524
70,mmlu-pro_all,10,10,300,Anchor Points,0.1314957462465795,0.1261121913991617,0.6363636363636364,0.7090909090909091
71,mmlu-pro_all,10,10,300,Anchor Points,0.056564617799295065,0.05297896686849437,0.38181818181818183,0.4909090909090909
72,mmlu-pro_all,10,10,300,Anchor Points,0.05260428757037678,0.055460891889172984,0.2727272727272727,0.2
73,mmlu-pro_all,10,10,300,Anchor Points,0.1579777234434808,0.16924493029626067,0.8545454545454545,0.8909090909090909
74,mmlu-pro_all,10,10,300,Anchor Points,0.16197640722068973,0.153115462041683,0.8545454545454545,0.8545454545454545
75,mmlu-pro_all,10,10,300,Anchor Points,0.1339459119834643,0.13010370211145358,0.9272727272727274,0.8909090909090909
76,mmlu-pro_all,10,10,300,Anchor Points,0.13397521883194566,0.13176321276253813,0.7818181818181819,0.8181818181818182
77,mmlu-pro_all,10,10,300,Anchor Points,0.1517736915032253,0.14479469670916295,0.7090909090909091,0.7090909090909091
78,mmlu-pro_all,10,10,300,Anchor Points,0.1259801044247414,0.12884547003902777,0.34545454545454546,0.38181818181818183
79,mmlu-pro_all,10,10,300,Anchor Points,0.14179321400495581,0.14846660838201264,0.8545454545454545,0.7706746355884524
80,mmlu-pro_all,25,25,300,Anchor Points,0.08895823851531098,0.08437219049649097,0.7090909090909091,0.7818181818181819
81,mmlu-pro_all,25,25,300,Anchor Points,0.09650883845461519,0.08946124030549917,0.8545454545454545,0.7454545454545454
82,mmlu-pro_all,25,25,300,Anchor Points,0.07505703241139615,0.08482127519332447,0.7454545454545454,0.6727272727272727
83,mmlu-pro_all,25,25,300,Anchor Points,0.10187047814726231,0.1131376850000422,0.7090909090909091,0.7454545454545454
84,mmlu-pro_all,25,25,300,Anchor Points,0.10230670255057096,0.09344575737156421,0.7090909090909091,0.7090909090909091
85,mmlu-pro_all,25,25,300,Anchor Points,0.07436781463512397,0.07064468604080755,0.7454545454545454,0.7090909090909091
86,mmlu-pro_all,25,25,300,Anchor Points,0.07369363184878588,0.07148162577937833,0.7090909090909091,0.7454545454545454
87,mmlu-pro_all,25,25,300,Anchor Points,0.08925164791315467,0.08227265311909235,0.7454545454545454,0.8909090909090909
88,mmlu-pro_all,25,25,300,Anchor Points,0.07640227842902657,0.07926764404331298,0.6363636363636364,0.6727272727272727
89,mmlu-pro_all,25,25,300,Anchor Points,0.08061397362381079,0.08728736800086762,0.8545454545454545,0.8440722199302099
90,mmlu-pro_all,50,50,300,Anchor Points,0.08640937566225514,0.08102582081483732,0.7454545454545454,0.8181818181818182
91,mmlu-pro_all,50,50,300,Anchor Points,0.06863826773886285,0.06159066958974682,0.9636363636363636,0.8545454545454545
92,mmlu-pro_all,50,50,300,Anchor Points,0.07356341530758659,0.08492128951186101,0.7818181818181819,0.7090909090909091
93,mmlu-pro_all,50,50,300,Anchor Points,0.06257293087711623,0.0738401377298961,0.7818181818181819,0.8181818181818182
94,mmlu-pro_all,50,50,300,Anchor Points,0.06487540148982655,0.056014456310819805,0.8181818181818182,0.8181818181818182
95,mmlu-pro_all,50,50,300,Anchor Points,0.06881029028939199,0.06496808041738128,0.8181818181818182,0.7818181818181819
96,mmlu-pro_all,50,50,300,Anchor Points,0.06626962584226503,0.0651781996556882,0.8181818181818182,0.8545454545454545
97,mmlu-pro_all,50,50,300,Anchor Points,0.055236099677871776,0.048682027715082364,0.8181818181818182,0.8181818181818182
98,mmlu-pro_all,50,50,300,Anchor Points,0.04580832036500784,0.048673685979294244,0.6363636363636364,0.7454545454545454
99,mmlu-pro_all,50,50,300,Anchor Points,0.04840421797572269,0.05293997682146621,0.8545454545454545,0.8440722199302099
100,mmlu-pro_all,100,100,300,Anchor Points,0.06690078749398903,0.06285746971999308,0.7454545454545454,0.8181818181818182
101,mmlu-pro_all,100,100,300,Anchor Points,0.060911778950479896,0.053864180801363834,0.8909090909090909,0.7818181818181819
102,mmlu-pro_all,100,100,300,Anchor Points,0.055533951152607314,0.06689182535688175,0.8909090909090909,0.8181818181818182
103,mmlu-pro_all,100,100,300,Anchor Points,0.049846846704452656,0.061114053557232546,0.8181818181818182,0.8545454545454545
104,mmlu-pro_all,100,100,300,Anchor Points,0.047878212667290934,0.04044653594189889,0.8545454545454545,0.8545454545454545
105,mmlu-pro_all,100,100,300,Anchor Points,0.055638984723943345,0.0525935271758565,0.8181818181818182,0.8545454545454545
106,mmlu-pro_all,100,100,300,Anchor Points,0.04920299639578421,0.04699099032637666,0.8181818181818182,0.8545454545454545
107,mmlu-pro_all,100,100,300,Anchor Points,0.0475920184933428,0.04136042650767757,0.7818181818181819,0.9272727272727274
108,mmlu-pro_all,100,100,300,Anchor Points,0.05452276505608138,0.05738813067036777,0.7454545454545454,0.7818181818181819
109,mmlu-pro_all,100,100,300,Anchor Points,0.048505238540971066,0.055178632918027895,0.8545454545454545,0.8440722199302099
110,mmlu-pro_all,250,250,300,Anchor Points,0.049104822631071146,0.043378315166441896,0.8545454545454545,0.9272727272727274
111,mmlu-pro_all,250,250,300,Anchor Points,0.04070213220264079,0.035478330334917464,0.9272727272727274,0.8181818181818182
112,mmlu-pro_all,250,250,300,Anchor Points,0.04337293592383584,0.054461584955515896,0.8181818181818182,0.7454545454545454
113,mmlu-pro_all,250,250,300,Anchor Points,0.03667865051979345,0.0466488345215391,0.8545454545454545,0.8909090909090909
114,mmlu-pro_all,250,250,300,Anchor Points,0.04337299362961865,0.03669851413798277,0.8545454545454545,0.8545454545454545
115,mmlu-pro_all,250,250,300,Anchor Points,0.03855965258468564,0.03569501038011864,0.8545454545454545,0.8909090909090909
116,mmlu-pro_all,250,250,300,Anchor Points,0.03222526423790065,0.03044682172696962,0.8181818181818182,0.8545454545454545
117,mmlu-pro_all,250,250,300,Anchor Points,0.04432758856689857,0.03916681899803097,0.8181818181818182,0.8181818181818182
118,mmlu-pro_all,250,250,300,Anchor Points,0.03300772233390559,0.033042626852865256,0.8181818181818182,0.9272727272727274
119,mmlu-pro_all,250,250,300,Anchor Points,0.03897586819939931,0.045659711827958166,0.8545454545454545,0.8440722199302099
120,mmlu-pro_all,500,500,300,Anchor Points,0.03965520454534565,0.033917621019153306,0.8181818181818182,0.8909090909090909
121,mmlu-pro_all,500,500,300,Anchor Points,0.03254290708946147,0.027428438436472656,0.9272727272727274,0.8181818181818182
122,mmlu-pro_all,500,500,300,Anchor Points,0.027266401890401106,0.03754671712824314,0.8909090909090909,0.8181818181818182
123,mmlu-pro_all,500,500,300,Anchor Points,0.03290045910381812,0.04151853956426037,0.8545454545454545,0.8909090909090909
124,mmlu-pro_all,500,500,300,Anchor Points,0.0298474877694026,0.02637736520950146,0.8909090909090909,0.8909090909090909
125,mmlu-pro_all,500,500,300,Anchor Points,0.031613348967578295,0.028748706763011292,0.8909090909090909,0.9272727272727274
126,mmlu-pro_all,500,500,300,Anchor Points,0.02982240812699801,0.028043965616066983,0.8545454545454545,0.8909090909090909
127,mmlu-pro_all,500,500,300,Anchor Points,0.03627366148600206,0.03204720048471861,0.8181818181818182,0.8181818181818182
128,mmlu-pro_all,500,500,300,Anchor Points,0.02676615960093679,0.026801064119896467,0.7454545454545454,0.8545454545454545
129,mmlu-pro_all,500,500,300,Anchor Points,0.02707702847233431,0.03204885772644736,0.8181818181818182,0.8073734277593311
130,mmlu-pro_all,1000,1000,300,Anchor Points,0.027360740760332403,0.023317422986336465,0.8545454545454545,0.9272727272727274
131,mmlu-pro_all,1000,1000,300,Anchor Points,0.024903413992474796,0.021358872226053673,0.9272727272727274,0.8181818181818182
132,mmlu-pro_all,1000,1000,300,Anchor Points,0.021976244077963083,0.032256559315805115,0.8545454545454545,0.7818181818181819
133,mmlu-pro_all,1000,1000,300,Anchor Points,0.026612530808898974,0.03523061126934122,0.8181818181818182,0.8545454545454545
134,mmlu-pro_all,1000,1000,300,Anchor Points,0.025675949915220486,0.021755178548395763,0.8909090909090909,0.8909090909090909
135,mmlu-pro_all,1000,1000,300,Anchor Points,0.025498548046863877,0.022752987119991165,0.8545454545454545,0.8909090909090909
136,mmlu-pro_all,1000,1000,300,Anchor Points,0.02771811592632117,0.025939673415390142,0.8181818181818182,0.8545454545454545
137,mmlu-pro_all,1000,1000,300,Anchor Points,0.024812653902234572,0.02067012852311263,0.8181818181818182,0.8181818181818182
138,mmlu-pro_all,1000,1000,300,Anchor Points,0.021869160832469526,0.022652894884550617,0.7818181818181819,0.8909090909090909
139,mmlu-pro_all,1000,1000,300,Anchor Points,0.023347855348429734,0.028401610575397217,0.8909090909090909,0.8807710121010884
140,mmlu-pro_all,10,10,300,tinyBenchmarks,0.13241698421339135,0.14043549867947577,-0.03669879217087869,0.03669879217087869
141,mmlu-pro_all,25,25,300,tinyBenchmarks,0.0590636908002177,0.06410887075139958,0.3302891295379082,0.2568915451961508
142,mmlu-pro_all,50,50,300,tinyBenchmarks,0.032646827557007235,0.0299541961079352,0.4403855060505442,0.4403855060505442
143,mmlu-pro_all,100,100,300,tinyBenchmarks,0.03902800459686687,0.04476558812305921,0.2201927530252721,0.2201927530252721
144,mmlu-pro_all,250,250,300,tinyBenchmarks,0.02526764652513153,0.03234546712474574,0.5504818825631803,0.5504818825631803
145,mmlu-pro_all,500,500,300,tinyBenchmarks,0.01817576967277569,0.02283418665538166,0.6727272727272727,0.7454545454545454
146,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.009602008105000587,0.01656203348864406,0.8181818181818182,0.8909090909090909
147,mmlu-pro_all,10,10,300,tinyBenchmarks,0.11539043125869465,0.12285933519048677,0.03739787960033829,0.07479575920067658
148,mmlu-pro_all,25,25,300,tinyBenchmarks,0.044925300913324834,0.051733289687613565,0.3209833376209784,0.28322059201851035
149,mmlu-pro_all,50,50,300,tinyBenchmarks,0.034476501542369725,0.041524099691485766,0.5504818825631803,0.4403855060505442
150,mmlu-pro_all,100,100,300,tinyBenchmarks,0.045530151817576935,0.04988616344277112,0.697277051246695,0.587180674734059
151,mmlu-pro_all,250,250,300,tinyBenchmarks,0.02284824290812315,0.02126130302568124,0.5636363636363636,0.5272727272727272
152,mmlu-pro_all,500,500,300,tinyBenchmarks,0.020564930744571386,0.01956795158299123,0.7090909090909091,0.7454545454545454
153,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.012626262626263492,0.013616932047513113,0.7818181818181819,0.6727272727272727
154,mmlu-pro_all,10,10,300,tinyBenchmarks,0.17198935462408516,0.16631611437348798,0.5000857559392293,0.4259989772815657
155,mmlu-pro_all,25,25,300,tinyBenchmarks,0.11918587068287664,0.11043351734063332,0.8073734277593311,0.7339758434175737
156,mmlu-pro_all,50,50,300,tinyBenchmarks,0.034960382265771466,0.03005148466783335,0.587180674734059,0.5137830903923016
157,mmlu-pro_all,100,100,300,tinyBenchmarks,0.029577209217927825,0.03157595008312183,0.7454545454545454,0.8181818181818182
158,mmlu-pro_all,250,250,300,tinyBenchmarks,0.015060787515877288,0.016029774740260697,0.7454545454545454,0.8181818181818182
159,mmlu-pro_all,500,500,300,tinyBenchmarks,0.011885320268553807,0.014295399938067903,0.7090909090909091,0.7090909090909091
160,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.012596020081048438,0.0068502580060859,0.8181818181818182,0.7454545454545454
161,mmlu-pro_all,10,10,300,tinyBenchmarks,0.14251799431440143,0.13125078746162155,0.4430456239044162,0.4430456239044162
162,mmlu-pro_all,25,25,300,tinyBenchmarks,0.07416984213391392,0.06489374015484581,0.38895558795273394,0.4259989772815657
163,mmlu-pro_all,50,50,300,tinyBenchmarks,0.029607451763140346,0.02864515607866114,0.7706746355884524,0.7339758434175737
164,mmlu-pro_all,100,100,300,tinyBenchmarks,0.06663944837597534,0.05537224152319544,0.6727272727272727,0.7090909090909091
165,mmlu-pro_all,250,250,300,tinyBenchmarks,0.019793745841650032,0.01995528725092852,0.7454545454545454,0.7090909090909091
166,mmlu-pro_all,500,500,300,tinyBenchmarks,0.008105002116978443,0.012032252820443574,0.9272727272727274,0.8909090909090909
167,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.010584890824411237,0.010235504024670451,0.8909090909090909,0.8545454545454545
168,mmlu-pro_all,10,10,300,tinyBenchmarks,0.07358011250226819,0.07825974950330888,0.4861724348043977,0.44877455520405946
169,mmlu-pro_all,25,25,300,tinyBenchmarks,0.09443234742635932,0.10144156467091983,0.14679516868351475,0.14679516868351475
170,mmlu-pro_all,50,50,300,tinyBenchmarks,0.08705316639448359,0.09591411157349032,0.7818181818181819,0.7090909090909091
171,mmlu-pro_all,100,100,300,tinyBenchmarks,0.033675074094235764,0.037694539442206895,0.5636363636363636,0.5636363636363636
172,mmlu-pro_all,250,250,300,tinyBenchmarks,0.01861428657835847,0.016834336963845342,0.7454545454545454,0.7454545454545454
173,mmlu-pro_all,500,500,300,tinyBenchmarks,0.008286457388253813,0.007382838269901359,0.8181818181818182,0.8909090909090909
174,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.005655355954758203,0.010778523492628866,0.9636363636363636,0.9636363636363636
175,mmlu-pro_all,10,10,300,tinyBenchmarks,0.08920038710457866,0.09445489369594641,0.1005037815259212,0.1005037815259212
176,mmlu-pro_all,25,25,300,tinyBenchmarks,0.08478497550353832,0.08822820482060134,0.6605782590758164,0.6238794669049376
177,mmlu-pro_all,50,50,300,tinyBenchmarks,0.0623147644105728,0.06478042606019209,0.5504818825631803,0.587180674734059
178,mmlu-pro_all,100,100,300,tinyBenchmarks,0.03440089517933835,0.031549294444434446,0.8440722199302098,0.8807710121010884
179,mmlu-pro_all,250,250,300,tinyBenchmarks,0.01347305389221561,0.014005704487531538,0.7090909090909091,0.7454545454545454
180,mmlu-pro_all,500,500,300,tinyBenchmarks,0.016058791507893452,0.018865500650767576,0.8545454545454545,0.8909090909090909
181,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.011356075727332549,0.012002140890873519,0.8545454545454545,0.9636363636363636
182,mmlu-pro_all,10,10,300,tinyBenchmarks,0.05212302667392485,0.05265362760836927,0.8227533512074424,0.8601512308077806
183,mmlu-pro_all,25,25,300,tinyBenchmarks,0.03799975805963825,0.03789647122748357,0.37397879600338285,0.44877455520405946
184,mmlu-pro_all,50,50,300,tinyBenchmarks,0.038740700417347146,0.038809404245973,0.7339758434175737,0.8440722199302098
185,mmlu-pro_all,100,100,300,tinyBenchmarks,0.024950099800399205,0.02486505896894424,0.6727272727272727,0.7818181818181819
186,mmlu-pro_all,250,250,300,tinyBenchmarks,0.014743240791144903,0.016955246860552456,0.8181818181818182,0.8545454545454545
187,mmlu-pro_all,500,500,300,tinyBenchmarks,0.011718986269884324,0.01071013113207781,0.8181818181818182,0.7090909090909091
188,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.009072763563780402,0.01246193809903487,0.9272727272727274,0.8909090909090909
189,mmlu-pro_all,10,10,300,tinyBenchmarks,0.13022439968547767,0.12434462595999998,0.4670993664969138,0.5449492609130662
190,mmlu-pro_all,25,25,300,tinyBenchmarks,0.03529305026311011,0.041635424409411266,0.5983660736054126,0.44877455520405946
191,mmlu-pro_all,50,50,300,tinyBenchmarks,0.032767797737857596,0.03868844411247973,0.6238794669049376,0.4770842982214229
192,mmlu-pro_all,100,100,300,tinyBenchmarks,0.01872013548660251,0.02315889714580701,0.6363636363636364,0.5636363636363636
193,mmlu-pro_all,250,250,300,tinyBenchmarks,0.021668783644831578,0.019535990939695175,0.6238794669049376,0.697277051246695
194,mmlu-pro_all,500,500,300,tinyBenchmarks,0.008180608480009814,0.007066838838179838,0.8545454545454545,0.8545454545454545
195,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.014622270610295989,0.009957503697931434,0.8181818181818182,0.8181818181818182
196,mmlu-pro_all,10,10,300,tinyBenchmarks,0.1504566624327103,0.1475912968184239,-0.14959151840135315,-0.07479575920067658
197,mmlu-pro_all,25,25,300,tinyBenchmarks,0.03908848968729208,0.0394789902727516,0.5371291452680611,0.46304236661039755
198,mmlu-pro_all,50,50,300,tinyBenchmarks,0.03992015968063869,0.03581396554520072,0.6727272727272727,0.6363636363636364
199,mmlu-pro_all,100,100,300,tinyBenchmarks,0.04116010403435557,0.03704388263641851,0.6605782590758164,0.7706746355884524
200,mmlu-pro_all,250,250,300,tinyBenchmarks,0.02126050928446136,0.024204184000869686,0.7090909090909091,0.8181818181818182
201,mmlu-pro_all,500,500,300,tinyBenchmarks,0.010267344099679547,0.009182370183623596,0.7818181818181819,0.7454545454545454
202,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.007076755579748681,0.007233715394137813,0.8909090909090909,0.8545454545454545
203,mmlu-pro_all,10,10,300,tinyBenchmarks,0.08395330551019174,0.08557995254834135,0.6617241025372945,0.628539361054709
204,mmlu-pro_all,25,25,300,tinyBenchmarks,0.05490534083348455,0.05107514760070116,0.697277051246695,0.6851851851851851
205,mmlu-pro_all,50,50,300,tinyBenchmarks,0.04533357527369506,0.04368884299256118,0.14679516868351475,0.05555555555555555
206,mmlu-pro_all,100,100,300,tinyBenchmarks,0.04448678400774214,0.042214915784056624,0.587180674734059,0.49999999999999994
207,mmlu-pro_all,250,250,300,tinyBenchmarks,0.022046815459989114,0.025672062700331722,0.6727272727272727,0.587180674734059
208,mmlu-pro_all,500,500,300,tinyBenchmarks,0.0125960200810502,0.014777301332098534,0.8545454545454545,0.7706746355884524
209,mmlu-pro_all,1000,1000,300,tinyBenchmarks,0.01094780136696316,0.015850911687146727,0.8545454545454545,0.7706746355884524
210,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.061846004959777406,0.0604658195940747,0.3659020326817838,0.40655781409087083
211,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.10760297586644892,0.11193937505036239,0.37397879600338285,0.261785157202368
212,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.10699812496219682,0.11526324176372917,0.6482593132545565,0.574172534596893
213,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.15620274602310527,0.1490862029087903,0.574172534596893,0.574172534596893
214,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.1986859614105123,0.20754690658951905,0.3819143697985006,0.34171285718813205
215,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.16597108812677674,0.1696942167210932,0.49580054640770793,0.49580054640770793
216,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.0863349059456844,0.08837711167818396,0.7395739969534469,0.778498944161523
217,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.07527369503417408,0.07996291721399644,0.5427204202399745,0.6231234454607115
218,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.09763805721889555,0.10243808165406829,-0.059005294323649496,-0.059005294323649496
219,mmlu-pro_all,10,10,300,Stratified sampling (confidence),0.12274693038166093,0.11893239093141615,0.40451991747794525,0.46656947481584343
220,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.05881116554769248,0.055741858676686684,0.46304236661039755,0.46304236661039755
221,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.0490413113167604,0.05556723058463761,0.5853225568382547,0.6230853024407229
222,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.07835717453980927,0.06699930033553486,0.9349469900084573,0.8601512308077806
223,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.06586322304885178,0.05997983897301542,0.07339758434175737,0.11009637651263605
224,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.04357194701506076,0.039989561800590885,0.5504818825631803,0.5504818825631803
225,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.05033241597612854,0.04862581206927399,0.6731618328060892,0.5983660736054126
226,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.0572430895784189,0.055031083509011355,0.6112159239257249,0.5000857559392293
227,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.0763594024073066,0.08171122770064422,0.587180674734059,0.587180674734059
228,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.04968094114800701,0.04996926011060532,0.3302891295379082,0.40368671387966554
229,mmlu-pro_all,25,25,300,Stratified sampling (confidence),0.062077360430653844,0.06145058619296191,0.1272727272727273,0.03669879217087869
230,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.036981772385964014,0.042162189765239486,0.6605782590758164,0.7339758434175737
231,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.026345121205400657,0.02627274785864584,0.8073734277593311,0.7706746355884524
232,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03137458848536694,0.02767104051570006,0.6727272727272727,0.6727272727272727
233,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.04281689146958607,0.041964775104309966,0.697277051246695,0.6605782590758164
234,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.043070130782206624,0.041015953344493934,0.5272727272727272,0.5272727272727272
235,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.05076091803886214,0.047245548697006236,0.6238794669049376,0.587180674734059
236,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.058484294038186314,0.05968541540363019,0.6727272727272727,0.6363636363636364
237,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.026810688387534753,0.03196614813149191,0.6238794669049376,0.4770842982214229
238,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.03241649817248618,0.036341417889078374,0.45454545454545453,0.41818181818181815
239,mmlu-pro_all,50,50,300,Stratified sampling (confidence),0.05174764108147345,0.04847199330993373,0.8807710121010884,0.7962962962962963
240,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.03271031480918529,0.03742588115962144,0.7090909090909091,0.7090909090909091
241,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.02159969178100112,0.02882898633796591,0.6727272727272727,0.6363636363636364
242,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.028690581858110555,0.03812647700630203,0.8909090909090909,0.8181818181818182
243,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.025838726587229497,0.027284393929011753,0.7090909090909091,0.6727272727272727
244,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.019506567672735314,0.01720908789823288,0.4909090909090909,0.4909090909090909
245,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.03182038120690212,0.035454831826452295,0.6605782590758164,0.6238794669049376
246,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.030689908113679023,0.027806618014123777,0.6363636363636364,0.7454545454545454
247,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.07113312456376324,0.064901532578098,0.6000000000000001,0.7454545454545454
248,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.042868114495447994,0.04054494446583039,0.6363636363636364,0.6000000000000001
249,mmlu-pro_all,100,100,300,Stratified sampling (confidence),0.023175523678793396,0.020455140905374133,0.7090909090909091,0.6238794669049377
250,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.020654088177564416,0.016873616279548444,0.8909090909090909,0.9636363636363636
251,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.026223528266842964,0.0322132200802362,0.6000000000000001,0.5636363636363636
252,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.018046049555399454,0.026783271577679256,0.7818181818181819,0.7090909090909091
253,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.014015306850641085,0.011808527341569026,0.7818181818181819,0.7454545454545454
254,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.013687161892223977,0.014122992940797517,0.7818181818181819,0.7818181818181819
255,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.014996030398398641,0.018392194155225274,0.6727272727272727,0.6363636363636364
256,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.01485091809239724,0.01705073047431009,0.8545454545454545,0.8909090909090909
257,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.02009483308081764,0.023292548339446505,0.7454545454545454,0.8181818181818182
258,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.01703347244991207,0.01686804354956409,0.8181818181818182,0.8545454545454545
259,mmlu-pro_all,250,250,300,Stratified sampling (confidence),0.011632944962713925,0.013624472452056762,0.8440722199302098,0.8703703703703703
260,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.022732022851641383,0.021802903540681106,0.6000000000000001,0.6727272727272727
261,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.009607452685139432,0.01712734801893906,0.8181818181818182,0.7090909090909091
262,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.018832183305513547,0.010305111513989494,0.9636363636363636,0.8909090909090909
263,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.011984886263100729,0.008154519535887667,0.8909090909090909,0.8545454545454545
264,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.014158901231589095,0.011485178495853095,0.7818181818181819,0.8545454545454545
265,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.009914918535708622,0.013477229130465935,0.8545454545454545,0.8909090909090909
266,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.014965563067087472,0.01484376900294555,0.8545454545454545,0.8909090909090909
267,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.011979570972294528,0.006703607137729984,0.8909090909090909,0.8909090909090909
268,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.02119353306890503,0.024058898683191422,0.7454545454545454,0.8545454545454545
269,mmlu-pro_all,500,500,300,Stratified sampling (confidence),0.014733061606711588,0.019819979708525302,0.7454545454545454,0.6605782590758164
270,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.009440752209152808,0.009123477628157888,0.8181818181818182,0.8909090909090909
271,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.009986546759639614,0.006807277548684396,0.9272727272727274,0.9636363636363636
272,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.01614413572961677,0.010594140626210093,0.7818181818181819,0.8545454545454545
273,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.008000479585434402,0.009879582609292437,0.8545454545454545,0.8181818181818182
274,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.009163066334969838,0.00930294137720654,0.9636363636363636,0.8909090909090909
275,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.013326597699374599,0.010381543410547823,0.8181818181818182,0.8545454545454545
276,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.005558249522782186,0.007970466171859108,0.8545454545454545,0.9636363636363636
277,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.006878362279303218,0.010598480093263822,0.8909090909090909,0.7454545454545454
278,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.004900308074809986,0.007204903207769683,0.8181818181818182,0.7818181818181819
279,mmlu-pro_all,1000,1000,300,Stratified sampling (confidence),0.012224385127479932,0.006725663820751417,1.0,0.9174698042719671
