model_a,mean,std,lower,upper
gpt-4o-2024-05-13,1282.936740715753,1.639868245261304,-2.8247562113645017,2.676456002035593
claude-3-5-sonnet-20240620,1267.376543227681,2.4734650853066635,-4.180948986272597,4.127821370028869
gemini-advanced-0514,1260.7899621287793,1.7809815686891284,-2.908752015758637,2.845708721554047
gemini-1.5-pro-api-0514,1258.6322707322236,1.7117659207945168,-2.8591641725920454,2.7135447623609252
gpt-4-turbo-2024-04-09,1251.9537010989818,1.4199864677069967,-2.323435665241277,2.3287788493546486
gpt-4-1106-preview,1248.2831258779079,1.3873693376377774,-2.3438995666829214,2.3137345260715847
gemini-1.5-pro-api-0409-preview,1246.8260277360087,1.5887560785490291,-2.628384075493841,2.609722518468061
claude-3-opus-20240229,1244.5956076172724,1.2706523497276587,-2.1458067877938447,2.0940845197537783
gpt-4-0125-preview,1242.7940183507392,1.4042162615549503,-2.3549375836676063,2.234302343582158
yi-large-preview,1233.0022663902184,1.6867421674392713,-2.764870182804543,2.8342172527854927
gemini-1.5-flash-api-0514,1225.5226647429677,1.7849133912395039,-2.9963653266368055,3.0151043450341604
yi-large,1214.7429722450572,2.8366658799102993,-4.475188941442866,4.399516237778471
gemma-2-27b-it,1210.4110864951615,3.633826621816487,-6.026453612714249,5.93936184751783
bard-jan-24-gemini-pro,1207.443942482561,2.852256832284887,-4.533481880177533,4.686289687801263
glm-4-0520,1205.7141307428967,3.1448818301901036,-5.105592403237324,5.022914332751043
nemotron-4-340b-instruct,1204.2203688485044,2.325241478681064,-3.9097572085006504,3.6134785397459837
llama-3-70b-instruct,1201.9697286617518,1.2806673144537926,-2.1048338604323362,2.1281481124910897
claude-3-sonnet-20240229,1197.8317171360761,1.2991988718405685,-2.16145190542602,2.150399777842722
reka-core-20240501,1194.4810167175326,1.449244863297486,-2.3973414415993375,2.4356718103424555
command-r-plus,1188.1938782756831,1.397302444477048,-2.2722136788936496,2.2422375072430896
gpt-4-0314,1187.800784615765,1.570377462812477,-2.6091933590284953,2.5458496336455028
qwen-max-0428,1184.7249779372792,1.7871689359140333,-2.876684968957761,2.7379514433550867
qwen2-72b-instruct,1184.279427212423,1.9601609751522109,-3.191170378609968,3.2866743583006155
claude-3-haiku-20240307,1181.4756003147268,1.3634556333337322,-2.2333951076850553,2.2232029688477724
gemma-2-9b-it,1179.7696556907408,3.713478033638196,-5.9630760511813605,6.0396459786725245
deepseek-coder-v2,1176.876226361922,2.918747379792555,-4.870180202270149,4.595216797414878
glm-4-0116,1173.3158036719399,3.1802319141514896,-5.252686067388595,5.143892568838282
qwen1.5-110b-chat,1164.814391729758,1.892290836570534,-3.124663873594727,3.102356014684574
gpt-4-0613,1164.676319130593,1.425614132065951,-2.360779410789064,2.216733861080911
reka-flash-preview-20240611,1164.3837377536368,2.5761634358378798,-4.088849088017014,4.273503187913775
yi-1.5-34b-chat,1158.028100227211,2.113038585043578,-3.4023002533049294,3.3435121337843157
reka-flash-21b-20240226-online,1154.9350263447081,2.226308518391131,-3.651935813054706,3.6403786319731353
mistral-large-2402,1154.6016715936823,1.5152504438122762,-2.3815957427518697,2.5138512745068056
llama-3-8b-instruct,1153.5962326619933,1.37441684844282,-2.3371566019359307,2.1642649781415457
qwen1.5-72b-chat,1151.2003725798643,1.5843922096062595,-2.6169715133885347,2.4204035193642994
claude-1,1150.3361229647746,2.2959240303606,-3.752813110590523,3.7248951805065644
command-r,1149.4936261305595,1.5068622927717248,-2.5499752764128516,2.4666467608160474
reka-flash-21b-20240226,1146.5380206179457,1.9495357607446047,-3.3079030081546534,3.1069493298739417
mistral-medium,1145.9920391877438,1.7407603679626826,-2.824919345592434,2.882459374409791
mixtral-8x22b-instruct-v0.1,1145.6131100908262,1.6479153052961877,-2.685728848057124,2.74420711052926
gemini-pro-dev-api,1135.5527538669842,2.3159283018706036,-3.743552982984511,3.556425191997505
claude-2.0,1132.5616645464543,2.809721928331314,-4.51185005156708,4.7474450142510705
qwen1.5-32b-chat,1131.9975165923997,1.9903973115934654,-3.37213486975088,3.3390757281936203
zephyr-orpo-141b-A35b-v0.1,1127.8504078923347,3.8207106190114897,-5.97517331498716,6.333181789346554
mistral-next,1127.1997640261695,2.7050986147594718,-4.50230320134574,4.541153464772378
phi-3-medium-4k-instruct,1124.8135310948883,2.4924089211952323,-4.084978986117221,4.118166474148893
gpt-3.5-turbo-0613,1120.4211983144323,1.7619559958593196,-2.947451565785059,2.874383245345598
qwen1.5-14b-chat,1119.1304693170805,2.095647425718263,-3.6518137617440516,3.4250202545840693
starling-lm-7b-beta,1118.731227392609,2.300894166508162,-4.048850237599254,3.703633671700345
claude-2.1,1118.0853113140408,1.796273650745795,-2.91556126971318,2.9709197487841266
yi-34b-chat,1116.0750132210237,2.402435867101414,-3.801629059970992,3.850665909520785
gemini-pro,1115.2050234160918,3.7668447879614773,-6.113069882132322,6.165862204061796
mixtral-8x7b-instruct-v0.1,1114.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,1112.6395661073782,1.4132034747481377,-2.227533461603116,2.363364733395656
claude-instant-1,1111.6715982852104,2.4094348843365943,-3.9375211235717416,3.8951316622440117
wizardlm-70b,1110.2573404871953,3.3351093152666884,-5.608272282473763,5.447281026921928
gpt-3.5-turbo-0314,1109.9874096086812,4.516102863810337,-7.191138851624373,7.336488574902887
dbrx-instruct-preview,1105.6820445447224,1.785809125314296,-3.061731972093412,2.846578496042639
phi-3-small-8k-instruct,1104.1831302547407,2.3362229332164537,-3.7806809016228726,3.877671767523907
tulu-2-dpo-70b,1103.3694633303053,3.654650824739868,-6.072586546130879,6.193681541575643
snowflake-arctic-instruct,1098.9722601691396,1.7632814997855915,-2.7734401051516215,2.8512665483683577
openchat-3.5-0106,1098.7931217509642,2.5254219379524776,-4.083936343045934,4.202785372604922
llama-2-70b-chat,1096.5452425915717,1.7059009259948161,-2.791620564000368,2.7367042221314932
vicuna-33b,1095.0402242180146,2.1491353381761913,-3.412060727179778,3.624790127890492
starling-lm-7b-alpha,1092.553701263589,2.9011898067713364,-4.673887975946855,4.85712161493143
gemma-1.1-7b-it,1089.640350545829,1.9970023372063668,-3.2834325725286817,3.279497263848725
nous-hermes-2-mixtral-8x7b-dpo,1087.4871258678954,4.396416938667882,-6.842639226305209,7.617106037371741
llama2-70b-steerlm-chat,1083.3569555463368,5.01890857263329,-8.458785660714966,7.947270125498335
openchat-3.5,1080.404046803922,3.2944081326842043,-5.305884392687858,5.441797513565007
deepseek-llm-67b-chat,1079.912318473453,4.22566935429663,-7.040225140530993,6.982006845869819
openhermes-2.5-mistral-7b,1080.0875255798078,3.959071264123761,-6.334248931569618,6.4269604572971275
qwen1.5-7b-chat,1079.4057234782506,3.851220396107146,-6.3023434578312845,6.260830182171503
pplx-70b-online,1077.1379158646646,3.6809637130498567,-6.083439026861015,5.9207709400238855
mistral-7b-instruct-v0.2,1074.9372814232497,2.1440300459359785,-3.5477569570273317,3.6972360955903696
gpt-3.5-turbo-1106,1073.0409078978776,2.3756523944070755,-3.7598547059369594,3.941930811858356
phi-3-mini-4k-instruct,1071.9522457813252,2.215824532901279,-3.6851240399275866,3.7208828577306576
llama-2-13b-chat,1068.4111563700228,2.3744991022028685,-3.849039425243973,3.984366924278447
solar-10.7b-instruct-v1.0,1067.0695659214105,4.394095351757111,-7.175023612616997,7.220063833370887
dolphin-2.2.1-mistral-7b,1066.2323880298236,6.862478445939082,-11.33611284191943,11.446765855663216
wizardlm-13b,1063.3308604906217,3.5368471290170027,-5.5557569498312205,5.985456430549675
zephyr-7b-beta,1056.7748860440697,3.0430637805127225,-4.978300434298035,5.213556383044306
phi-3-mini-128k-instruct,1053.9006048848378,2.0844777876456244,-3.354675106494369,3.5754992366596525
vicuna-13b,1050.0987276290805,2.4480281790815326,-3.9481630795264664,4.169855102633619
mpt-30b-chat,1049.9995968909748,5.582415416908434,-9.36771631469378,9.394666384565426
codellama-34b-instruct,1048.7148453412399,3.4845372758753634,-5.732990956924141,5.966693995885635
zephyr-7b-alpha,1047.993613370296,6.638233083244357,-10.944025288644298,10.748926993859186
codellama-70b-instruct,1047.0990523624973,8.322778600979559,-13.488747410899578,14.592511200007266
pplx-7b-online,1045.0064754332257,3.9414588171757194,-6.648644795728387,6.365918369201836
gemma-7b-it,1043.2594786510874,3.1156131964941682,-5.167213754712975,5.294366326565296
llama-2-7b-chat,1042.8615474573858,2.477805485586332,-4.136854730434152,4.065483337952855
qwen-14b-chat,1041.2544304117662,4.045148287573047,-6.42436451708727,6.81518853297348
falcon-180b-chat,1039.2232579110305,8.37197732446507,-13.25873140487829,14.127109474839699
guanaco-33b,1036.7286185700807,5.443803796919947,-9.36755714974015,9.052620361416075
gemma-1.1-2b-it,1033.7754221449654,2.8614136032096957,-4.744090159853158,4.5361082054910185
stripedhyena-nous-7b,1023.9139129197557,4.077834331504699,-6.750314593322969,6.826056368540208
olmo-7b-instruct,1021.4492451897717,3.779865239640601,-6.03977234869501,6.140657087917248
mistral-7b-instruct,1016.1063920229136,3.257678447969437,-5.3094265515209145,5.607835730846091
palm-2,1012.2339927701535,3.5732781583669855,-5.921488156129271,5.708077650206519
vicuna-7b,1011.7735109038476,3.707614957529927,-6.225077328001021,6.17796312054179
qwen1.5-4b-chat,1003.4816988032351,3.387622810642563,-5.2987235851107926,5.540687306776363
gemma-2b-it,999.6104430853794,4.135967900643576,-6.8454875026002355,6.737346871039449
koala-13b,970.7301138540923,4.046975879621998,-6.572451427852798,6.509583045363911
chatglm3-6b,962.1611864948823,4.639481460152355,-7.709930273459577,7.916371013775688
gpt4all-13b-snoozy,940.8496780421708,7.182111286977945,-12.199337163786709,11.602166003170055
chatglm2-6b,935.8388561096964,5.974091715030079,-9.818985771411235,9.8807581236631
mpt-7b-chat,935.170537718564,4.972829313280843,-8.233129594925344,8.029082949808526
RWKV-4-Raven-14B,928.6797256031716,4.65381296628111,-7.733951718343178,7.993435226871952
alpaca-13b,910.7080513383291,4.24708644359062,-7.234827155781659,7.03987507791976
oasst-pythia-12b,901.957393433464,4.217644987811717,-6.811342356693103,7.042327224131668
chatglm-6b,888.5638916457731,4.811035764861355,-7.500572995724838,7.934049268774288
fastchat-t5-3b,878.7856924294904,4.873223161995899,-7.9097049356373645,7.888829412395808
stablelm-tuned-alpha-7b,850.9013438353705,5.6722506209239185,-9.180239931333745,9.656865155001356
dolly-v2-12b,828.0219921855528,5.535935543607996,-9.325411994864453,9.20974631752108
llama-13b,805.5447483335744,6.565340331302003,-10.598125426031629,10.230944147374771
