model_a,mean,std,lower,upper
gpt-4o-2024-05-13,4.950067845020107,3.4304617701585323,-5.432037393217189,5.613898759330238
claude-3-5-sonnet-20240620,10.988339171461515,5.175960572078357,-8.806435941701821,8.474269472011107
gemini-advanced-0514,-7.745547046214967,3.763527562471771,-6.134848729394668,6.1917772072031925
gemini-1.5-pro-api-0514,4.752729174091735,3.731777917187992,-5.824684587789713,6.0965988656570635
gpt-4-turbo-2024-04-09,4.509891396775161,2.9177455898010747,-4.686658870840891,5.080575389257806
gpt-4-1106-preview,0.35725515408541425,3.074843583215819,-5.212139199722224,4.923259205053654
gemini-1.5-pro-api-0409-preview,-15.235421756438493,3.432486649344061,-5.767474771712916,5.4947644338404995
claude-3-opus-20240229,2.725679831523478,2.6689873603195338,-4.329945978488722,4.2902151142181015
gpt-4-0125-preview,2.0383725316301975,3.056128679050653,-5.0020388661565045,5.066878257529383
yi-large-preview,6.748251423641927,3.6144468895009614,-5.993245001457778,6.0864861494387075
gemini-1.5-flash-api-0514,1.3035848410520214,3.6514558924791203,-6.277326531725792,6.018283816202371
yi-large,5.721752547046253,6.238041487799821,-10.224054592136277,10.242652585679387
gemma-2-27b-it,-17.85247202139885,7.457884669633064,-12.479338616160085,11.683085236553705
bard-jan-24-gemini-pro,-50.69947331186447,6.68662482317688,-10.614885050120336,10.643515603414492
glm-4-0520,10.308475767963037,6.3882655596434335,-10.880885683442303,10.661668812800999
nemotron-4-340b-instruct,-4.134899419998265,5.0731134075842705,-8.180409285566274,8.34320276033041
llama-3-70b-instruct,-4.6439571817601255,2.7178541782689076,-4.523760307032302,4.483079638329768
claude-3-sonnet-20240229,1.2967069564117706,2.806292446858987,-4.74515181197272,4.4415623696531785
reka-core-20240501,-6.210539404462785,3.1900008556120003,-5.8286128809227105,5.222045900945303
command-r-plus,-19.778161638540695,3.020587643484419,-4.848092055094758,5.051196344633347
gpt-4-0314,8.922017689753352,3.463544919685517,-5.605846598273455,5.5335278504979435
qwen-max-0428,3.1451122858712153,4.2848616603363014,-7.07521044260616,7.10700963161508
qwen2-72b-instruct,1.489429895464202,4.191989532990615,-6.920888800177481,7.192936492052315
claude-3-haiku-20240307,0.7165887550491302,2.8193262918663344,-4.598697051975831,4.698705995263171
gemma-2-9b-it,-10.464285615973887,7.837536243125975,-13.253440406755061,12.706174399104311
deepseek-coder-v2,37.106650724902494,6.270478510452699,-10.779870767360208,10.342746827231288
glm-4-0116,14.347490709060274,6.6517895479547855,-11.277709767289284,10.567739373865127
qwen1.5-110b-chat,3.813215099691302,4.013839215351226,-6.40817751389274,6.3314010752597305
gpt-4-0613,3.300091480169838,3.044442054207484,-4.957756299859078,4.837265131320451
reka-flash-preview-20240611,-13.976923715004533,5.173518541088467,-8.500667385708748,8.609308413681015
yi-1.5-34b-chat,2.465506856160369,4.477213406889142,-7.9684095843993745,7.016237149247559
reka-flash-21b-20240226-online,-12.329321020539174,5.202654659488848,-8.78656355494964,8.478459152963255
mistral-large-2402,9.854330889780515,3.261281532066255,-5.2891021084232595,5.522929070877938
llama-3-8b-instruct,-16.495470617619656,2.8455336729614995,-4.582549293717399,4.5523048707859
qwen1.5-72b-chat,-3.5552951197332403,3.479461949827766,-5.932703094115732,5.811048893405995
claude-1,-24.643723878143543,4.966283299961936,-8.48556408785857,7.89931924705559
command-r,-30.14790356652458,3.3284403569900247,-5.550480688449717,5.453730829183389
reka-flash-21b-20240226,-13.42879173453978,4.056540203390016,-6.31135137015324,7.066774117646249
mistral-medium,-1.4961496028706185,3.759656038303392,-6.307918547745645,5.989543569229713
mixtral-8x22b-instruct-v0.1,3.003857205841476,3.65662477497913,-6.1702490778247885,5.939916230316012
gemini-pro-dev-api,-27.21928820526664,5.340387259616339,-8.862973145310729,8.783172042005159
claude-2.0,-5.495753456983672,6.297740043884913,-10.108822768346778,10.639769313388221
qwen1.5-32b-chat,3.9153075580141077,4.42932186834255,-7.386561296103485,7.546088330700881
zephyr-orpo-141b-A35b-v0.1,-11.364410060499468,8.726148825067456,-15.068079073268501,14.494746653365672
mistral-next,2.8337589363778464,5.780240982468896,-9.07461655952622,9.81064760882188
phi-3-medium-4k-instruct,9.222607995761173,5.252824694024856,-8.989704066188725,8.790977950446536
gpt-3.5-turbo-0613,4.067086940287711,4.087847167582952,-6.56879276194794,6.837380204599085
qwen1.5-14b-chat,1.0571499574089365,4.684214888431359,-7.368020680390911,7.370219338427436
starling-lm-7b-beta,-0.9700402607342414,4.8857778387088935,-8.052581102210716,7.830074904584378
claude-2.1,1.8949230822276935,3.895600379398558,-6.482173622322341,6.411138040903698
yi-34b-chat,-12.404029935720034,5.429929360960396,-9.160012882464924,8.767637408142908
gemini-pro,-29.74005676630859,8.427095165125131,-13.482927670756045,14.40188265818832
mixtral-8x7b-instruct-v0.1,0.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,1.1096695001119377,3.1068182944095137,-5.053080431289871,5.384543982744487
claude-instant-1,-3.687627199487053,5.27294926683671,-8.74813134128486,8.722250223095012
wizardlm-70b,-31.200708485683304,7.742417903956796,-12.192779286369355,12.889040923957719
gpt-3.5-turbo-0314,8.539264701938023,11.738847860087196,-20.61154209026951,18.35138380465021
dbrx-instruct-preview,1.9514826349951744,3.7658199548336815,-6.369127318769775,6.201842769604029
phi-3-small-8k-instruct,2.7654890654191546,5.079748266851541,-8.14802823439292,8.56237699283609
tulu-2-dpo-70b,-4.6655191121155415,8.2962710041055,-13.9181202810431,14.320118412437628
snowflake-arctic-instruct,-22.729373739638977,3.8090614854465654,-6.1197848070402685,6.218222342370797
openchat-3.5-0106,-12.63977192966394,5.362646825028682,-8.480561792046402,8.718586238566534
llama-2-70b-chat,-24.628664367079697,3.6320588219421714,-5.78897306421387,6.02319106247646
vicuna-33b,-27.358117601909782,4.852337865321317,-8.205219959825609,7.808244472828832
starling-lm-7b-alpha,-17.449775262506584,6.377275835611857,-10.551619945996837,10.762548370696148
gemma-1.1-7b-it,-12.600576778039922,4.363294870508746,-7.148382690090615,7.241921013911256
nous-hermes-2-mixtral-8x7b-dpo,-35.16483185864716,9.558025948460886,-15.538357099542662,15.844802520261446
llama2-70b-steerlm-chat,-35.21039663489463,11.72401432147229,-20.041583943967417,19.93364169748412
openchat-3.5,-13.721319911004873,7.63271254184347,-12.687964466475336,12.08158557369432
deepseek-llm-67b-chat,-14.472563107973151,9.351240566446897,-15.532320783930174,15.263152540675861
openhermes-2.5-mistral-7b,-9.522645564392496,9.030256275792825,-14.828840591629753,15.082407402065792
qwen1.5-7b-chat,-10.095390519220294,8.472093582093782,-13.74234312436663,12.937451430815846
pplx-70b-online,-48.887942823849606,8.142723987799192,-13.695583545266977,13.396338418665003
mistral-7b-instruct-v0.2,-6.101156347704948,4.618310577885258,-7.366766062778302,7.911792971235271
gpt-3.5-turbo-1106,19.534101463241377,5.451808423862136,-9.277744839673623,8.995505650613639
phi-3-mini-4k-instruct,8.106424164231703,4.395991231381184,-7.132529716140686,7.100433064372873
llama-2-13b-chat,-18.59201557124556,5.217324302354688,-8.650213226992452,9.03991134413289
solar-10.7b-instruct-v1.0,-9.456985147701495,10.226893281737057,-16.653338460324743,16.966087691881
dolphin-2.2.1-mistral-7b,-14.580750195642777,15.985794004713073,-26.583435779086926,27.257058235254966
wizardlm-13b,-49.04002267469231,8.780573757346984,-14.262411967950108,15.036532752056146
zephyr-7b-beta,-29.09507249847589,6.9129809418037516,-11.511185842510322,11.916150962109402
phi-3-mini-128k-instruct,-15.269137829592468,4.618389967174797,-7.434531688636265,7.60601211012049
vicuna-13b,-23.19688656375569,5.609853695588578,-8.941363758216553,9.180927459844831
mpt-30b-chat,-2.1060358324404906,15.154318292642444,-24.98374525676321,25.08490084668454
codellama-34b-instruct,-19.37463069077232,8.198325596488814,-13.231059142815402,14.538029568808618
zephyr-7b-alpha,-26.74129942192442,17.38639201177951,-28.49116843963777,28.177556262817998
codellama-70b-instruct,2.7072814214307255,17.94765377031028,-28.616700659661593,29.902746801607982
pplx-7b-online,-27.653858167781063,8.879092066950834,-14.356645506193004,14.737147004399976
gemma-7b-it,3.781878582103125,6.9431399409067,-11.35848798373439,11.821885856089203
llama-2-7b-chat,-31.856673960249292,5.836173713370417,-9.223703041391222,9.81034028179268
qwen-14b-chat,-16.41722425199817,10.104521855804235,-16.810892618585335,16.277194971309527
falcon-180b-chat,-38.0302946518415,20.032514553683367,-31.7270089173169,32.24858369000742
guanaco-33b,-35.251664705204654,14.137455623012787,-22.97097734250482,23.57399419216994
gemma-1.1-2b-it,-15.198180051196578,6.157306005690154,-10.565340415930024,10.237833746384355
stripedhyena-nous-7b,-26.44971363526139,9.149172339414513,-14.2747421459983,14.531620871797045
olmo-7b-instruct,-25.748254836835645,8.13953363250336,-13.193031730441783,12.480215537673875
mistral-7b-instruct,-10.402854073498007,7.311303900352771,-12.58842206974493,11.59942838188987
palm-2,-11.627431828075952,8.71451773793266,-13.908790328464308,14.372570330886532
vicuna-7b,-19.7592538866323,9.414725884197807,-15.27385571757964,15.550770214682458
qwen1.5-4b-chat,-21.331187119533944,7.133408833420317,-11.47402394593166,11.787657387717339
gemma-2b-it,-11.629118023959949,8.876788674979482,-14.910228335882838,14.839713920917342
koala-13b,-42.535925110097516,10.068069672576984,-16.357350548826336,16.77527354677224
chatglm3-6b,-6.932277322108497,10.431376753363168,-17.020948640170907,17.237196842272542
gpt4all-13b-snoozy,-5.767930574580644,17.257342750694423,-28.52660881593595,28.17546628396466
chatglm2-6b,-18.922784438739495,14.454114830338735,-23.147199989510632,25.10509113457646
mpt-7b-chat,-36.43036699685212,11.923378106577095,-18.459562793807358,20.642259880083266
RWKV-4-Raven-14B,-35.67704099430641,11.361205358921689,-18.214145233165766,18.216702895844897
alpaca-13b,-88.19505019659589,10.807234058764074,-17.684492314038096,17.971059965596154
oasst-pythia-12b,-18.202185536242656,10.498899626889667,-16.448098761931274,18.064994453625946
chatglm-6b,9.221812296702783,11.766796023942744,-19.400825021610178,19.027507575150683
fastchat-t5-3b,-66.48872947668114,11.806198083831282,-19.011277547173798,18.624326779417146
stablelm-tuned-alpha-7b,-16.90889392286182,14.018257641333804,-22.64291798519329,23.832687914796253
dolly-v2-12b,-26.98243503105639,14.166071730502333,-22.337169999905846,23.1749294500735
llama-13b,-78.56076474993702,17.123654203429133,-27.760508671862645,28.66042655660293
