model_a,mean,std,lower,upper
gpt-4o-2024-05-13,1282.9367407157497,1.6398682452583133,-2.8247562113679123,2.6764560020303634
claude-3-5-sonnet-20240620,1267.3765432276837,2.473465085311591,-4.180948986271687,4.1278213700268225
gemini-advanced-0514,1260.7899621287745,1.7809815686845858,-2.9087520157634117,2.8457087215494994
gemini-1.5-pro-api-0514,1258.6322707322195,1.7117659207897276,-2.8591641726516173,2.7135447623568325
gpt-4-turbo-2024-04-09,1251.95370109898,1.4199864677071166,-2.323435665243096,2.3287788493757944
gpt-4-1106-preview,1248.2831258779042,1.387369337640162,-2.3438995671169778,2.3137345260679467
gemini-1.5-pro-api-0409-preview,1246.826027736009,1.5887560785518082,-2.6283840754933863,2.6097225184694253
claude-3-opus-20240229,1244.595607617277,1.2706523497220028,-2.1458067878327256,2.094084519759008
gpt-4-0125-preview,1242.7940183507274,1.4042162615454339,-2.3549375836903437,2.234302343568288
yi-large-preview,1233.0022663902141,1.6867421674260237,-2.764870182807954,2.8342172527804905
gemini-1.5-flash-api-0514,1225.5226647429638,1.7849133912350423,-2.996365326641353,3.0151043450287034
yi-large,1214.7429722450618,2.836665879910678,-4.475188941449005,4.399516236996533
gemma-2-27b-it,1210.4110864951613,3.6338266218194146,-6.026453612602154,5.939361847503051
bard-jan-24-gemini-pro,1207.4439424825584,2.8522568322765873,-4.533481880151612,4.686289687800809
glm-4-0520,1205.7141307428792,3.1448818301922365,-5.10559240325847,5.022914332735127
nemotron-4-340b-instruct,1204.2203688484897,2.3252414786486204,-3.909757208515657,3.6134785397330234
llama-3-70b-instruct,1201.9697286617409,1.280667314424143,-2.1048338604434775,2.128148112322606
claude-3-sonnet-20240229,1197.8317171360893,1.2991988718752354,-2.1614519057498,2.1503997778529538
reka-core-20240501,1194.481016717531,1.449244863295276,-2.397341441601384,2.435671810340409
command-r-plus,1188.193878275687,1.397302444470165,-2.272213678890239,2.242237507246955
gpt-4-0314,1187.8007846157657,1.5703774628113631,-2.6091933590312237,2.545849633554326
qwen-max-0428,1184.7249779372778,1.7871689359220415,-2.8766849689000082,2.737951443353495
qwen2-72b-instruct,1184.279427212407,1.9601609751433713,-3.1911703786261114,3.2866743582831077
claude-3-haiku-20240307,1181.4756003147083,1.3634556332888352,-2.233395106100488,2.223202968807527
gemma-2-9b-it,1179.7696556907342,3.7134780336056727,-5.963076051187272,6.039645978670478
deepseek-coder-v2,1176.8762263619215,2.918747379773031,-4.870180202270376,4.595216797416697
glm-4-0116,1173.315803671922,3.1802319141443056,-5.252686067394279,5.143892569028594
qwen1.5-110b-chat,1164.8143917297552,1.892290836574755,-3.1246638734207863,3.1023560146823
gpt-4-0613,1164.6763191306159,1.4256141320643168,-2.360779410765417,2.2167338611036485
reka-flash-preview-20240611,1164.3837377536324,2.576163435843766,-4.088849088013376,4.273503187904453
yi-1.5-34b-chat,1158.028100227242,2.113038585036735,-3.402300253328576,3.343512133755894
reka-flash-21b-20240226-online,1154.93502634473,2.22630851841793,-3.6519358130324235,3.6403786320231575
mistral-large-2402,1154.6016715936798,1.5152504438111383,-2.381595742754371,2.51385127450294
llama-3-8b-instruct,1153.5962326620063,1.3744168484358124,-2.3371566019627608,2.1642649781504133
qwen1.5-72b-chat,1151.2003725798634,1.5843922096048908,-2.616971513377848,2.4204035193642994
claude-1,1150.3361229648217,2.2959240303538038,-3.752813110543457,3.72489518035718
command-r,1149.4936261305595,1.5068622927719684,-2.5499752764117147,2.466646760802405
reka-flash-21b-20240226,1146.5380206179448,1.9495357607640404,-3.307903008150106,3.1069493287341174
mistral-medium,1145.9920391877451,1.7407603679613977,-2.8249193455949353,2.882459374369546
mixtral-8x22b-instruct-v0.1,1145.6131100908253,1.6479153052976028,-2.685728848058943,2.7442071105217565
gemini-pro-dev-api,1135.5527538669965,2.3159283018867907,-3.743552982954043,3.556425192010238
claude-2.0,1132.5616645464725,2.809721928346337,-4.511850051557076,4.7474450142626665
qwen1.5-32b-chat,1131.997516592394,1.990397311547345,-3.3721348694189146,3.339075728122225
zephyr-orpo-141b-A35b-v0.1,1127.8504078923165,3.82071061895504,-5.975173314130416,6.3331817893285915
mistral-next,1127.1997640261363,2.705098614719511,-4.502303201298673,4.541153464739409
phi-3-medium-4k-instruct,1124.8135310948608,2.4924089211756444,-4.084978986144733,4.118166474124337
gpt-3.5-turbo-0613,1120.4211983144423,1.7619559958548427,-2.9474515659023837,2.8743832453549203
qwen1.5-14b-chat,1119.1304693170907,2.095647425728856,-3.6518137617342745,3.4250202545895263
starling-lm-7b-beta,1118.7312273926152,2.300894166514345,-4.048850237642,3.7036336717151244
claude-2.1,1118.0853113140524,1.7962736507471184,-2.9155612697018114,2.9709197487927668
yi-34b-chat,1116.075013221038,2.402435867116308,-3.801629059958259,3.8506659095448867
gemini-pro,1115.2050234160995,3.7668447879640676,-6.113069882098671,6.16586220407612
mixtral-8x7b-instruct-v0.1,1114.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,1112.6395661073764,1.413203474742891,-2.227533461604935,2.3633647334070247
claude-instant-1,1111.6715982852252,2.4094348843445483,-3.937521123557417,3.8951316622619743
wizardlm-70b,1110.2573404872298,3.3351093152760836,-5.608272282381222,5.44728102695376
gpt-3.5-turbo-0314,1109.9874096087474,4.516102863787218,-7.191138851558435,7.336488575708472
dbrx-instruct-preview,1105.6820445447238,1.7858091253118307,-3.061731972067946,2.8465784960442306
phi-3-small-8k-instruct,1104.1831302547353,2.3362229332236883,-3.780680901627875,3.8776717671537426
tulu-2-dpo-70b,1103.3694633303076,3.654650824744023,-6.072586546125194,6.193681541519709
snowflake-arctic-instruct,1098.972260169142,1.7632814997861048,-2.7734401051479836,2.8512665483694946
openchat-3.5-0106,1098.7931217509317,2.5254219379329137,-4.083936343085952,4.202785373250663
llama-2-70b-chat,1096.5452425915748,1.7059009259965543,-2.7916205639921827,2.736704222135131
vicuna-33b,1095.0402242180296,2.1491353381729255,-3.412060727025846,3.624790127905271
starling-lm-7b-alpha,1092.5537012636032,2.9011898067778437,-4.67388797593253,4.857121614789776
gemma-1.1-7b-it,1089.6403505458256,1.9970023371984444,-3.2834325725323197,3.279497263481744
nous-hermes-2-mixtral-8x7b-dpo,1087.4871258679216,4.396416938681845,-6.8426392262815625,7.617106036428595
llama2-70b-steerlm-chat,1083.3569555463325,5.018908572619788,-8.458785659709747,7.947270125456271
openchat-3.5,1080.4040468039448,3.294408132668054,-5.305884392665121,5.4417975135600045
deepseek-llm-67b-chat,1079.9123184734685,4.225669354301901,-7.040225140612392,6.982006845889373
openhermes-2.5-mistral-7b,1080.0875255798514,3.9590712641095407,-6.334248931712864,6.426960457174346
qwen1.5-7b-chat,1079.4057234782233,3.851220396108864,-6.302343457854249,6.2608301820744146
pplx-70b-online,1077.1379158646703,3.680963713052681,-6.083439026854876,5.920770939834938
mistral-7b-instruct-v0.2,1074.9372814232415,2.1440300459346764,-3.54775695690887,3.6972360955628574
gpt-3.5-turbo-1106,1073.0409078978807,2.3756523944112105,-3.759854705941052,3.94193081185972
phi-3-mini-4k-instruct,1071.9522457813114,2.2158245328854136,-3.685124039941684,3.7208828577579425
llama-2-13b-chat,1068.4111563700233,2.374499102197869,-3.8490394252442,3.984366924283222
solar-10.7b-instruct-v1.0,1067.0695659214316,4.394095351734216,-7.175023612596078,7.220063833455924
dolphin-2.2.1-mistral-7b,1066.2323880298304,6.862478445920506,-11.336112841910335,11.446765855665944
wizardlm-13b,1063.3308604906417,3.536847129018848,-5.55575694981917,5.985456430436898
zephyr-7b-beta,1056.7748860440595,3.0430637805108436,-4.9783004343075845,5.21355638310547
phi-3-mini-128k-instruct,1053.9006048848587,2.0844777876502145,-3.3546751066514844,3.5754992368883904
vicuna-13b,1050.0987276290891,2.448028179082968,-3.9481630795187357,4.169855102642259
mpt-30b-chat,1049.999596891001,5.582415416893053,-9.367716314537574,9.394666384589527
codellama-34b-instruct,1048.7148453412399,3.484537275872998,-5.7329909563452475,5.966693995886317
zephyr-7b-alpha,1047.9936133703138,6.63823308323595,-10.944025288455805,10.748926993881696
codellama-70b-instruct,1047.0990523622547,8.32277860120698,-13.488747411150598,14.592511199765568
pplx-7b-online,1045.0064754332313,3.941458817155051,-6.648644795710879,6.365918369353267
gemma-7b-it,1043.259478651056,3.1156131964879594,-5.167213754744125,5.294366326313593
llama-2-7b-chat,1042.8615474573746,2.477805485588958,-4.136854730470304,4.065483337939895
qwen-14b-chat,1041.2544304117937,4.045148287584699,-6.424364517213007,6.815188532996217
falcon-180b-chat,1039.2232579109839,8.37197732448122,-13.258731404903301,14.127109472937263
guanaco-33b,1036.7286185701007,5.443803796923089,-9.367557149724462,9.052620361450408
gemma-1.1-2b-it,1033.775422144979,2.8614136032257376,-4.744090160111909,4.536108205513301
stripedhyena-nous-7b,1023.9139129197856,4.077834331502316,-6.750314593293297,6.826056368570221
olmo-7b-instruct,1021.4492451897586,3.7798652395297365,-6.039772348249471,6.140657087810382
mistral-7b-instruct,1016.1063920228866,3.2576784479951324,-5.309426551549677,5.607835730818465
palm-2,1012.2339927701527,3.5732781583666426,-5.9214881561302946,5.708077650205269
vicuna-7b,1011.7735109038595,3.7076149575185213,-6.2250773279898794,6.177963120471873
qwen1.5-4b-chat,1003.4816988032426,3.387622810682983,-5.298723585096468,5.540687306782388
gemma-2b-it,999.6104430854062,4.135967900598344,-6.8454875025737465,6.7373468711191435
koala-13b,970.7301138541073,4.046975879615772,-6.572451428239901,6.509583045378804
chatglm3-6b,962.1611864949023,4.639481460132981,-7.709930273284954,7.916371013795583
gpt4all-13b-snoozy,940.8496780421759,7.1821112869350685,-12.19933716377841,11.602166002911758
chatglm2-6b,935.8388561096888,5.974091715010632,-9.81898577138304,9.880758123705732
mpt-7b-chat,935.1705377185729,4.972829313288594,-8.233129594917045,8.029082949799431
RWKV-4-Raven-14B,928.6797256031647,4.653812966284424,-7.733951718350454,7.993435226864676
alpaca-13b,910.7080513383318,4.247086443594181,-7.234827155766084,7.039875077894521
oasst-pythia-12b,901.957393433473,4.217644987823909,-6.811342356684577,7.042327224140081
chatglm-6b,888.5638916457762,4.811035764850946,-7.500572995726088,7.934049268747799
fastchat-t5-3b,878.7856924295046,4.873223161984429,-7.909704935623154,7.888829412409905
stablelm-tuned-alpha-7b,850.901343835392,5.672250620957486,-9.18023993130123,9.65686515499874
dolly-v2-12b,828.021992185576,5.535935543606839,-9.325411994845695,9.209746317115787
llama-13b,805.544748333608,6.565340331254065,-10.598125426508773,10.230944147409218
