,Human agreement,Price [$/1000 examples],Time [seconds/1000 examples],Spearman corr.,Pearson corr.,Bias,Variance,Proba. prefer longer,Proba. prefer lists,Proba. prefer 1,# parsed,mode
alpaca_eval_gpt4_fn,70.98765432098766,14.471944444444444,5046.056233900002,0.95,0.9447778041206524,27.623456790123456,11.11111111111111,0.750561797752809,0.6756756756756757,0.4799382716049383,2592,verified
improved_aviary_gpt4,69.75308641975309,12.781435185185186,1831.2850013,0.8833333333333333,0.8993690915590962,,,0.7280898876404495,0.7027027027027027,0.4861111111111111,648,verified
alpaca_eval_gpt4,69.1743827160494,13.601944444444444,1455.4169713998845,0.9666666666666668,0.9335485321531084,28.395061728395056,14.621913580246911,0.6831460674157304,0.7297297297297297,0.5011574074074074,2592,minimal
alpaca_eval_clf_cot_gpt4_turbo,68.70109546165884,6.441079812206572,1753.4788411931145,0.9333333333333332,0.7570054666164165,,,0.6863636363636364,0.6545454545454545,0.5352112676056338,639,verified
alpaca_eval_cot_gpt4_turbo_fn,68.63874533448178,6.311349574632637,1988.6012626717545,0.9707197941566388,0.8997919147215918,29.320987654320984,18.435272517819858,0.6696629213483146,0.6126126126126126,0.5232018561484919,2586,minimal
weighted_alpaca_eval_cot_gpt4_turbo,68.45771313115921,6.447465224111284,1869.2926495435856,0.9333333333333332,0.7743167748273401,,,0.6853932584269663,0.6576576576576577,0.5283575514995362,647,verified
aviary_gpt4,68.3641975308642,12.781481481481482,1821.0640311000004,0.9205101496312952,0.9053426857899228,,,0.701123595505618,0.6486486486486487,0.5555555555555556,648,verified
alpaca_eval_gpt4_turbo_fn,68.09413580246913,5.533981481481482,864.3023563021605,0.9333333333333332,0.817290435500228,30.246913580246915,15.625,0.651685393258427,0.6036036036036037,0.5381944444444444,2592,minimal
Self-taught-llama3.1-70B-dpo,68.03937590094094,,206.82500262105583,0.7999999999999999,0.7516559995326958,30.34055727554179,13.015337123801086,0.6567505720823799,0.6146788990825688,0.5172549019607844,2550,minimal
gpt4_turbo_cot_logprob,67.86974910317902,5.397145061728395,1568.9484159171295,0.6333333333333333,0.6310442120964042,,,0.5932584269662922,0.5855855855855856,0.5285319490509259,648,verified
gpt4_turbo_cot_clf,67.59689922480621,5.3972248062015495,1528.4046718706977,0.6666666666666667,0.6326057742256878,,,0.5936794582392777,0.5855855855855856,0.5255813953488373,645,verified
claude_ranking,67.5925925925926,4.954578395061729,218.4230414438272,0.9,0.90848221004591,,,0.7303370786516854,0.6576576576576577,0.4552469135802468,648,verified
alpaca_eval_llama3_70b_fn,67.53091913784353,0.41207197526091993,208.69685160402955,0.9,0.8577236113497642,32.25308641975309,8.204334365325078,0.7910112359550562,0.6576576576576577,0.47931967529957475,2587,minimal
weighted_alpaca_eval_gpt-4o-mini-2024-07-18,0.33674775667807266,12.90736111111111,93.54821923706267,0.9833333333333333,0.9389828560875118,32.24432530355238,14.380747136032564,0.7094594594594594,0.6306306306306306,0.5017959384038282,2592,minimal
gpt4,66.93672839506173,12.452592592592593,1036.788589334915,0.8833333333333333,0.8668599990267735,31.481481481481488,14.621913580246911,0.647191011235955,0.6666666666666666,0.5397376543209877,2592,minimal
alpaca_farm_greedy_gpt4,66.43518518518519,15.28163425925926,877.6250469425926,0.8499999999999999,0.7481465609199582,30.246913580246915,19.290123456790123,0.597752808988764,0.6486486486486487,0.5362654320987654,2592,minimal
weighted_alpaca_eval_gpt4_turbo,65.73198824263118,4.323981481481481,227.7462866895061,0.7833333333333333,0.7688872243700914,33.89896126543981,23.652705035108028,0.6058558558558559,0.5727272727272728,0.5282783420419752,2592,minimal
humans,65.66358024691358,300.0,36800.00000000001,1.0,1.0,0.0,34.336419753086425,0.6359550561797753,0.6036036036036037,0.5177469135802468,2592,minimal
gpt4_turbo_clf,65.58641975308642,3.774166666666667,157.86959398549385,0.5666666666666667,0.6056662735192052,,,0.5123595505617977,0.5405405405405406,0.5555555555555556,648,verified
alpaca_eval_clf_gpt4_turbo,65.42635658914729,4.328077519379845,151.46231159178296,0.7166666666666667,0.7351663293324147,,,0.6049661399548533,0.5909090909090909,0.5271317829457365,645,verified
claude,65.31635802469135,3.298695848765433,172.99865933897803,0.9333333333333332,0.9028603896845376,32.407407407407405,18.47993827160494,0.6606741573033708,0.6666666666666666,0.494212962962963,2592,minimal
lmsys_gpt4,65.25848765432099,13.945289351851851,17981.91908101215,0.9833333333333332,0.9656100250020464,31.59722222222222,15.91435185185185,0.7389277389277389,0.6944444444444444,0.4635416666666667,2592,minimal
gpt4_turbo,64.14219474497682,4.165919629057188,185.73029410061824,0.5666666666666667,0.5688213739495881,,,0.5382882882882883,0.5675675675675675,0.571870170015456,647,verified
text_davinci_003,64.0817901234568,8.712680555439814,120.90134619274691,0.8499999999999999,0.8307147459007311,33.796296296296305,22.72376543209876,0.6966292134831461,0.6576576576576577,0.4733796296296295,2592,minimal
gpt4_turbo_logprob,63.51076045576003,3.774166666666667,142.6550541719136,0.6166666666666666,0.6016102512172834,35.53043431362654,17.968493710574844,0.509009009009009,0.5225225225225225,0.5600111475683258,2592,verified
guanaco_33b,62.74944567627494,,910.8929739450112,0.0,0.2495312789260463,,,0.6991150442477876,0.704225352112676,0.4257206208425721,451,verified
improved_lmsys_gpt4,62.34567901234568,13.938055555555556,5397.837981725772,0.9833333333333332,0.9273862641854697,,,0.7534883720930232,0.7117117117117117,0.4490740740740742,648,verified
longest,62.19135802469136,0.0,0.0,0.2666666666666666,0.5604276915228803,37.808641975308646,0.0,1.0,0.8828828828828829,0.4166666666666667,2592,minimal
chatgpt_fn,59.992283950617285,1.0088333333333337,529.928419875,0.75,0.8270316070156506,36.88271604938272,27.739197530864203,0.6247191011235955,0.6216216216216216,0.4911265432098766,2592,verified
alpaca_farm,57.80525502318392,11.978385883565174,1312.895122694532,0.5272012675161055,0.6048080773927609,,,0.5900900900900901,0.5636363636363636,0.5100463678516229,647,verified
chatgpt,57.28201740503198,0.8342726921591347,284.9753823429895,0.7166666666666667,0.7136212819980075,39.35185185185186,34.054591087228026,0.5910112359550562,0.5945945945945946,0.488991888760139,2589,minimal
cohere,56.60964230171073,6.485108864696734,503.1591360234836,0.2166666666666666,0.4349894801752539,,,0.6281179138321995,0.6486486486486487,0.4603421461897357,643,verified
