model_a,mean,std,lower,upper
gpt-4o-2024-05-13,4.950067845011204,3.430461770178429,-5.432037393258821,5.613898759477927
claude-3-5-sonnet-20240620,10.988339171473088,5.175960572122458,-8.80643594170463,8.474269471622684
gemini-advanced-0514,-7.745547046230735,3.7635275624751614,-6.134848730086498,6.191777205646092
gemini-1.5-pro-api-0514,4.752729174080466,3.731777917192374,-5.824684587815147,6.096598865646669
gpt-4-turbo-2024-04-09,4.50989139677323,2.917745589798487,-4.686658870864149,5.080575389237276
gpt-4-1106-preview,0.3572551540890158,3.0748435831618623,-5.212139199698955,4.92325920455277
gemini-1.5-pro-api-0409-preview,-15.235421756439077,3.4324866493390176,-5.767474771686192,5.494764433901171
claude-3-opus-20240229,2.7256798315227497,2.668987360326496,-4.329945978646179,4.2902151143984995
gpt-4-0125-preview,2.0383725316127674,3.056128679063514,-5.002038866876029,5.066878257324392
yi-large-preview,6.748251423631459,3.61444688951118,-5.9932450013879155,6.086486149423829
gemini-1.5-flash-api-0514,1.3035848410392585,3.6514558924993588,-6.277326531694058,6.018283816188051
yi-large,5.721752547041355,6.2380414878054715,-10.224054592121325,10.24265258585266
gemma-2-27b-it,-17.852472021347786,7.4578846696425485,-12.479338616786686,11.683085237749673
bard-jan-24-gemini-pro,-50.699473311811424,6.686624823185285,-10.614885050066832,10.64351560470373
glm-4-0520,10.308475768015011,6.388265559636441,-10.88088568338344,10.661668813398897
nemotron-4-340b-instruct,-4.134899420027063,5.0731134076029445,-8.180409285864805,8.343202760300647
llama-3-70b-instruct,-4.643957181786697,2.7178541782929466,-4.523760307441475,4.483079638270124
claude-3-sonnet-20240229,1.2967069564172216,2.8062924468409975,-4.745151811914234,4.441562369659208
reka-core-20240501,-6.210539404469287,3.1900008556207107,-5.828612883300901,5.222045900778809
command-r-plus,-19.778161638526342,3.0205876435023624,-4.848092055079189,5.051196344640335
gpt-4-0314,8.922017689752623,3.463544919678956,-5.605846598279605,5.533527850467611
qwen-max-0428,3.145112285871934,4.284861660316629,-7.0752104425538285,7.1070096316140585
qwen2-72b-instruct,1.4894298954349543,4.191989533044673,-6.920888800205512,7.192936492154148
claude-3-haiku-20240307,0.7165887550287675,2.8193262919151247,-4.598697051822083,4.698705994368465
gemma-2-9b-it,-10.464285615964487,7.837536243113134,-13.253440406657791,12.706174400455842
deepseek-coder-v2,37.10665072490103,6.270478510476377,-10.779870767380672,10.34274682723228
glm-4-0116,14.347490709045802,6.651789547925323,-11.277709767270819,10.567739373550875
qwen1.5-110b-chat,3.8132150996931666,4.013839215345182,-6.40817751419684,6.331401073975092
gpt-4-0613,3.3000914802145536,3.0444420541947417,-4.957756299835952,4.837265131365894
reka-flash-preview-20240611,-13.976923715022988,5.173518541131804,-8.500667386344535,8.609308413652453
yi-1.5-34b-chat,2.4655068561728752,4.477213406872314,-7.968409575382496,7.016237149196901
reka-flash-21b-20240226-online,-12.329321020487695,5.2026546594257725,-8.786563554710213,8.478459153014791
mistral-large-2402,9.85433088977878,3.2612815320657935,-5.289102108402314,5.5229290705914735
llama-3-8b-instruct,-16.49547061760792,2.845533673010795,-4.582549306010037,4.552304870790529
qwen1.5-72b-chat,-3.555295119731516,3.479461949824625,-5.932703094170113,5.811048893403024
claude-1,-24.64372387809049,4.966283299882161,-8.485564087537366,7.899319246756498
command-r,-30.147903566523958,3.328440356990718,-5.550480688441102,5.4537308291525335
reka-flash-21b-20240226,-13.428791734567506,4.056540203421853,-6.311351370249518,7.066774117590887
mistral-medium,-1.4961496028488004,3.7596560382990782,-6.30791854774736,5.98954356910657
mixtral-8x22b-instruct-v0.1,3.0038572058373476,3.6566247749857497,-6.170249078124092,5.939916230306483
gemini-pro-dev-api,-27.21928820522223,5.340387259599083,-8.86297314451684,8.783172041993613
claude-2.0,-5.49575345692334,6.297740043887253,-10.108822769596141,10.63976931360078
qwen1.5-32b-chat,3.915307557984333,4.429321868376348,-7.3865612962162075,7.546088331684864
zephyr-orpo-141b-A35b-v0.1,-11.36441006057752,8.72614882508901,-15.068079073331898,14.494746652897195
mistral-next,2.8337589363616544,5.780240982492167,-9.074616559618264,9.810647608533532
phi-3-medium-4k-instruct,9.222607995699448,5.252824694072953,-8.989704065817985,8.79097795038289
gpt-3.5-turbo-0613,4.0670869403191325,4.087847167571205,-6.568792761925556,6.837380204642239
qwen1.5-14b-chat,1.057149957441313,4.684214888389694,-7.368020680366186,7.370219338459596
starling-lm-7b-beta,-0.9700402607060157,4.885777838672899,-8.052581101826672,7.830074904645446
claude-2.1,1.894923082249148,3.8956003793785605,-6.482173622256992,6.411138040930565
yi-34b-chat,-12.404029935727642,5.42992936094135,-9.160012882527965,8.767637408220267
gemini-pro,-29.740056766291133,8.427095165120754,-13.48292767074419,14.401882658167011
mixtral-8x7b-instruct-v0.1,0.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,1.1096695001126133,3.106818294410173,-5.053080431284876,5.384543982745162
claude-instant-1,-3.687627199469267,5.2729492668197615,-8.748131341257105,8.722250223323687
wizardlm-70b,-31.20070848563853,7.7424179039234025,-12.19277928628544,12.889040921201136
gpt-3.5-turbo-0314,8.539264702088044,11.738847860101489,-20.61154209009409,18.35138380520848
dbrx-instruct-preview,1.9514826349992411,3.76581995484311,-6.3691273187633435,6.201842769663006
phi-3-small-8k-instruct,2.7654890654108177,5.079748266872418,-8.14802823433076,8.562376992832869
tulu-2-dpo-70b,-4.665519112030162,8.296271004085046,-13.918120280722661,14.320118411249624
snowflake-arctic-instruct,-22.72937373964038,3.8090614854606732,-6.1197848070417535,6.2182223424008605
openchat-3.5-0106,-12.639771929713849,5.3626468249831,-8.48056179211541,8.718586238499263
llama-2-70b-chat,-24.628664367077572,3.6320588219386454,-5.788973064210278,6.023191062472058
vicuna-33b,-27.358117601892324,4.852337865311525,-8.20521995980057,7.808244472876964
starling-lm-7b-alpha,-17.44977526251612,6.377275835615735,-10.551619945854554,10.76254837068636
gemma-1.1-7b-it,-12.600576778055084,4.363294870530075,-7.1483826900979315,7.241921014028447
nous-hermes-2-mixtral-8x7b-dpo,-35.164831858557456,9.558025948437534,-15.538357098853858,15.844802520400698
llama2-70b-steerlm-chat,-35.21039663486776,11.72401432155005,-20.041583941493258,19.933641697458782
openchat-3.5,-13.721319911094028,7.632712541922624,-12.687964467121011,12.081585573716588
deepseek-llm-67b-chat,-14.472563107968847,9.351240566429722,-15.532320783940605,15.26315254121556
openhermes-2.5-mistral-7b,-9.522645564383005,9.030256275833242,-14.82884059215501,15.082407402076875
qwen1.5-7b-chat,-10.09539051927513,8.472093582143465,-13.74234312431716,12.937451431129892
pplx-70b-online,-48.88794282391482,8.14272398781506,-13.695583545284649,13.396338419053691
mistral-7b-instruct-v0.2,-6.1011563476943005,4.618310577866823,-7.366766062767382,7.911792965817031
gpt-3.5-turbo-1106,19.5341014632504,5.4518084238517,-9.277744839676501,8.995505650585919
phi-3-mini-4k-instruct,8.10642416423633,4.395991231400879,-7.132529716283635,7.100433064404081
llama-2-13b-chat,-18.592015571260664,5.217324302364878,-8.650213226816993,9.039911345260599
solar-10.7b-instruct-v1.0,-9.456985147711366,10.226893281743443,-16.65333846037919,16.96608769164903
dolphin-2.2.1-mistral-7b,-14.580750195706702,15.985794004571998,-26.583435779179958,27.257058234340303
wizardlm-13b,-49.04002267462935,8.780573757344365,-14.262411967887203,15.036532752073207
zephyr-7b-beta,-29.095072498537608,6.912980941796968,-11.511185842468812,11.916150961749434
phi-3-mini-128k-instruct,-15.269137829541387,4.618389967162442,-7.434531688580647,7.606012110170992
vicuna-13b,-23.19688656378102,5.609853695590097,-8.941363758227368,9.18092745979466
mpt-30b-chat,-2.106035832429954,15.15431829266306,-24.983745257580075,25.08490084669086
codellama-34b-instruct,-19.374630690806896,8.198325596474193,-13.231059142859447,14.538029568157853
zephyr-7b-alpha,-26.74129942188834,17.38639201184083,-28.49116843947252,28.177556263298374
codellama-70b-instruct,2.70728142116705,17.94765377024933,-28.61670065908243,29.902746803174562
pplx-7b-online,-27.653858167750986,8.87909206696493,-14.356645506147226,14.737147004449465
gemma-7b-it,3.7818785821167213,6.943139940869435,-11.358487983744487,11.82188585637742
llama-2-7b-chat,-31.8566739602274,5.836173713362487,-9.223703041433893,9.810340281898586
qwen-14b-chat,-16.417224251991353,10.104521855737426,-16.81089261873694,16.277194971398654
falcon-180b-chat,-38.030294651553326,20.03251455414233,-31.727008922771184,32.24858368812668
guanaco-33b,-35.251664705293074,14.137455622992887,-22.970977341054237,23.573994192181353
gemma-1.1-2b-it,-15.198180051209102,6.157306005848236,-10.565340417347887,10.237833745061943
stripedhyena-nous-7b,-26.44971363523101,9.149172339398824,-14.274742145953851,14.53162087183944
olmo-7b-instruct,-25.748254836856425,8.139533632457823,-13.193031730247075,12.480215537661053
mistral-7b-instruct,-10.402854073589635,7.311303900317437,-12.588422069819181,11.599428382074302
palm-2,-11.627431828072977,8.714517737975251,-13.90879032852191,14.372570330817334
vicuna-7b,-19.759253886615582,9.414725884221475,-15.27385571719276,15.550770214549942
qwen1.5-4b-chat,-21.33118711959819,7.133408833524621,-11.47402394617198,11.787657387220165
gemma-2b-it,-11.629118023746353,8.876788675163757,-14.910228335622778,14.839713921150912
koala-13b,-42.53592511014432,10.068069672586198,-16.35735054909713,16.775273546491164
chatglm3-6b,-6.932277322065291,10.431376753349317,-17.020948640068923,17.237196842294495
gpt4all-13b-snoozy,-5.767930574641215,17.257342750753313,-28.526608817260723,28.175466283856938
chatglm2-6b,-18.922784438821278,14.454114830482867,-23.147199990919614,25.10509113444373
mpt-7b-chat,-36.43036699689574,11.923378106560042,-18.45956279385456,20.642259880650123
RWKV-4-Raven-14B,-35.67704099438823,11.361205358948308,-18.214145233533547,18.21670290883217
alpaca-13b,-88.19505019666666,10.807234058803981,-17.684492314107388,17.971059971213066
oasst-pythia-12b,-18.202185536292212,10.498899626894486,-16.44809876230169,18.064994452517254
chatglm-6b,9.221812296633532,11.766796024010716,-19.400825020455997,19.027507575008148
fastchat-t5-3b,-66.48872947671039,11.806198083844466,-19.01127754712912,18.62432677939269
stablelm-tuned-alpha-7b,-16.908893922992842,14.018257641633852,-22.642917985271776,23.83268791548387
dolly-v2-12b,-26.982435031153127,14.166071730651485,-22.33716999972806,23.174929448793552
llama-13b,-78.56076474994575,17.123654203478857,-27.760508671263082,28.660426556657583
