,mean,std,lower,upper
gpt-4o-2024-05-13,52.36482318975345,5.304480001326497,-9.156400610396851,8.966019411845352
claude-3-5-sonnet-20240620,44.77629595050014,7.361700297387036,-12.07207531850355,12.085903441614313
gemini-advanced-0514,68.81552472328393,5.512025762677822,-8.930574994398683,8.718851584342644
gemini-1.5-pro-api-0514,75.53819207366465,5.472735911934032,-8.682142951930757,8.497099348461234
gpt-4-turbo-2024-04-09,50.28182105850209,4.373432688856922,-7.057644088586116,7.39855205022873
gpt-4-1106-preview,52.46810748594726,4.756915311053703,-7.896576188875088,7.457897767693417
gemini-1.5-pro-api-0409-preview,54.586004975474125,5.142521441048933,-8.515661266960258,8.335473153811499
claude-3-opus-20240229,70.3477685348971,3.9854396481426084,-6.570529482478378,6.594600726820232
gpt-4-0125-preview,52.6170469474542,4.467865967031361,-7.437362127391118,7.247565086883043
yi-large-preview,83.84828730207194,5.497738739394499,-8.79987973068097,8.858156089195617
gemini-1.5-flash-api-0514,66.44551218401934,5.621662666405488,-9.300860461464097,9.42033455589847
yi-large,57.72522854515232,9.343649980610696,-15.112539406903068,15.305436459151437
gemma-2-27b-it,54.18240172754802,11.962760466853537,-20.076892442784008,20.12118038264392
bard-jan-24-gemini-pro,58.573016386379464,14.374182502676232,-22.500560195691847,24.09978752192903
glm-4-0520,89.73822919996593,10.065654802089817,-15.97479421409139,17.219888608222035
nemotron-4-340b-instruct,62.121462296743324,7.491683509472529,-12.435135886371562,12.342341645044655
llama-3-70b-instruct,-32.238323216730755,4.078884980193494,-6.583401484969016,6.532493643740388
claude-3-sonnet-20240229,48.48294765312604,3.800761706233707,-6.606234685125152,6.124671519408707
reka-core-20240501,62.79253795232905,5.09802487953613,-8.478256693934334,8.34849095595991
command-r-plus,63.115757880658414,4.2728821085139055,-7.0299778803523125,6.919342776564882
gpt-4-0314,53.8146303455407,5.032457530191074,-8.316853920237278,8.327684128704227
qwen-max-0428,105.41469708176533,6.924846449205744,-11.415915907863294,11.2546114087327
qwen2-72b-instruct,106.30477150367413,6.356510475348448,-10.378025094726837,10.55233910081536
claude-3-haiku-20240307,32.56576237816077,4.106514881748067,-7.049794121231528,6.435950506755823
gemma-2-9b-it,56.49838196214973,11.669421183013565,-19.4790761550314,19.886309675532964
deepseek-coder-v2,80.11582453737586,9.6488560167526,-16.21185852665181,16.185481890671035
glm-4-0116,99.17283779102002,11.076085412233574,-17.887350303559614,18.531494957801684
qwen1.5-110b-chat,95.89796666962164,6.23642731255641,-10.130241412733696,10.376136368737306
gpt-4-0613,24.93309252029674,4.597845408762561,-7.796121122897176,7.575705868169827
reka-flash-preview-20240611,52.17366524654146,8.076048526510348,-13.461705517552545,13.397565592209396
yi-1.5-34b-chat,109.66511016191068,6.692716849446095,-11.130965924693555,11.024825657957948
reka-flash-21b-20240226-online,46.34657633697005,6.736306520128684,-10.907933437814343,11.008148428543741
mistral-large-2402,18.22066182266825,4.708449228480066,-7.832938093783159,7.917093932144338
llama-3-8b-instruct,-14.78793965046902,4.1481864533809185,-6.786241211935732,6.7042715110537685
qwen1.5-72b-chat,93.94329562981247,5.496851062917844,-9.079118660421997,9.021997799226313
claude-1,48.58906995567272,9.154442608503732,-13.732344430402243,14.390252113705117
command-r,66.03070784307158,4.558615762466837,-7.510241650832512,7.20040414562466
reka-flash-21b-20240226,45.63681557844988,6.17416791836779,-9.935494329101829,10.20388960058414
mistral-medium,19.92126785022037,6.109454118125733,-10.36324961332799,9.948144120314279
mixtral-8x22b-instruct-v0.1,39.64543800398654,5.324361120429049,-9.024047256768007,8.468248067935846
gemini-pro-dev-api,50.85236204169756,7.71339442715468,-12.617221817795397,12.60770895803531
claude-2.0,57.61718551831238,13.080641474677408,-21.0360810853676,21.750371099785838
qwen1.5-32b-chat,104.12296291203758,5.90532741526735,-9.754941004453116,9.613957052133102
zephyr-orpo-141b-A35b-v0.1,21.9675618202127,11.247891226274612,-18.0853454537583,18.79723833753225
mistral-next,12.436049953631978,10.613858932052453,-17.757459169665104,17.189368886941423
phi-3-medium-4k-instruct,36.28605062764146,7.401091019775696,-11.932511082925274,12.157685818338386
gpt-3.5-turbo-0613,40.44738293650535,8.020901160577994,-13.26234606640297,13.503704604277008
qwen1.5-14b-chat,89.40694006425477,6.0039167564996925,-9.666130681784253,9.835205039546594
starling-lm-7b-beta,65.97192432600386,6.408640293669705,-10.287437639404615,10.509851282313619
claude-2.1,32.2875934819668,6.545340223070561,-11.808018566393422,10.890572449088527
yi-34b-chat,115.94136415654012,8.688411219362786,-14.278048754969376,14.653672419917143
gemini-pro,51.530946922327416,18.279542577266568,-30.155587897995304,29.42863468840844
mixtral-8x7b-instruct-v0.1,0.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,29.252977872279864,4.654368246928559,-7.68814250413736,7.608893128294223
claude-instant-1,37.924195725415174,10.563076926445051,-17.17715827434141,17.474514444381608
wizardlm-70b,9.904362169312886,16.590081745929364,-27.668059048394344,27.48433483120411
gpt-3.5-turbo-0314,95.00046365915468,18.64297540166346,-30.52798443256829,30.38672690240317
dbrx-instruct-preview,29.820472533790255,5.186423703634628,-8.497759030011434,8.201789631590223
phi-3-small-8k-instruct,21.34473440756365,7.021681573121732,-11.591705193060214,11.654604013957115
tulu-2-dpo-70b,-33.35889698799889,18.046242550101795,-31.272568475056698,28.9718137496111
snowflake-arctic-instruct,53.47918245712529,5.710757111367204,-9.577308664625633,9.21315895732318
openchat-3.5-0106,48.03903230024873,8.209500475125207,-13.033185810739155,14.045570061699593
llama-2-70b-chat,-39.316830859476696,5.772471796403195,-9.393495656030748,9.464975899337013
vicuna-33b,8.99089508851647,8.173959606070358,-12.91227408966224,13.141381413002932
starling-lm-7b-alpha,19.652243593423282,10.542095324978114,-17.118824470371766,17.45801034945698
gemma-1.1-7b-it,38.80281115105974,6.11478058262024,-9.807618510460593,9.925804069843828
nous-hermes-2-mixtral-8x7b-dpo,-8.928438302868479,29.376062515885856,-48.181114552229424,47.56609001908169
llama2-70b-steerlm-chat,14.083371782621995,22.21043168293135,-35.50255367213896,37.54065891364423
openchat-3.5,53.07374177577738,16.259831059914305,-25.73447958318674,26.775525793340844
deepseek-llm-67b-chat,102.52279726940243,20.656368779757926,-33.827521716242885,34.76966178289889
openhermes-2.5-mistral-7b,7.801106955154917,20.3942867863992,-33.28116281300618,34.63532998171173
qwen1.5-7b-chat,122.09040847253392,13.530437447274556,-23.296647445335495,22.729145603963886
pplx-70b-online,54.365245781660946,18.14144619386035,-30.51164495674496,29.58020041141536
mistral-7b-instruct-v0.2,7.980407486050066,6.822812676797453,-10.727314945267842,11.241422358617747
gpt-3.5-turbo-1106,10.19884066260918,11.443924366395404,-19.02591594161656,17.77127859141064
phi-3-mini-4k-instruct,12.476479969188464,7.2468466485256995,-11.515312094173947,12.005946692447495
llama-2-13b-chat,-17.387232686037837,9.315344242151339,-15.425302250708883,15.495879324188504
solar-10.7b-instruct-v1.0,-0.17232453674906742,22.18500700771359,-35.91253139122369,37.189834782018124
dolphin-2.2.1-mistral-7b,43.6809902306606,33.3003079901357,-54.579224566303864,56.3148282136054
wizardlm-13b,27.839354020044247,15.975135433586651,-26.00133178502753,26.07237412827664
zephyr-7b-beta,-31.102895616381964,14.836295997664308,-24.3481178410958,24.863410341451136
phi-3-mini-128k-instruct,47.03171751564965,6.8572694427140295,-11.414430864558518,11.103722867710765
vicuna-13b,63.159866255948764,9.534386382594807,-16.31077159522146,15.512813121235197
mpt-30b-chat,-3.208115835043983,28.93586269985257,-48.26896702366569,48.36575634948945
codellama-34b-instruct,-17.00450867029602,18.15769464502145,-29.410330177821876,31.50048592844295
zephyr-7b-alpha,1.0464396014162003,40.90819256984696,-68.44726048238996,68.52963448970948
codellama-70b-instruct,62.011529592932945,23.738603131267606,-37.18463638476916,39.829817834702695
pplx-7b-online,46.94916832402216,19.81629933768965,-31.801021977039873,33.262623367788706
gemma-7b-it,64.16772837121131,9.60751118354195,-15.584773622031719,15.963837137182452
llama-2-7b-chat,-10.686846597030776,9.562575646015627,-15.602122101533062,15.890944135924709
qwen-14b-chat,93.80998609883484,20.991142202127826,-36.46213584265935,34.70946049790423
falcon-180b-chat,-5.762971177186068,53.199495557532245,-103.14212310813484,79.85071173969195
guanaco-33b,-8.095558096811738,23.291251321025424,-37.767262094248366,37.54456771245018
gemma-1.1-2b-it,54.351836877690474,8.772176474363011,-13.836270827059295,13.94469110320015
stripedhyena-nous-7b,10.956356908008365,22.361595695130198,-36.71636712563496,36.25185536174171
olmo-7b-instruct,61.99822403375198,11.423583902587449,-18.54927096038712,18.37430525151116
mistral-7b-instruct,-8.405131305168066,15.871633409071533,-25.709062724317974,26.14543485410957
palm-2,-68.93408787428977,18.572847840803924,-32.35878256341896,30.6187791527623
vicuna-7b,30.045545508445585,16.37847991613943,-27.842111308766732,25.628271513017424
qwen1.5-4b-chat,92.00588640332236,9.805815157939639,-16.462167248745246,15.782024018023506
gemma-2b-it,67.15661123218935,12.597417051208888,-21.289676198152733,20.79979340298408
koala-13b,-34.03798368035321,15.551025724962956,-25.665537331618488,25.86223948539562
chatglm3-6b,158.85782837529405,20.475298982753213,-35.23476398067194,33.060947842218724
gpt4all-13b-snoozy,-5.710223918691646,29.18391482544417,-44.916938861746196,48.80482007657421
chatglm2-6b,119.92299962149177,29.435220479713333,-46.265978055687995,47.27002945771095
mpt-7b-chat,73.27143681426085,19.88335871190633,-32.039127581561914,33.9682472516064
RWKV-4-Raven-14B,28.90187645642271,18.83649986632935,-30.944947820181266,31.58661996401892
alpaca-13b,-64.53512012375975,18.677575095823414,-30.913494526768808,30.449823278725134
oasst-pythia-12b,-40.61042806859528,16.4754880281189,-26.56931100196388,26.811384647235727
chatglm-6b,251.67440142358024,18.97142311707941,-30.33069802358395,30.029843230629808
fastchat-t5-3b,22.32561871133334,189.1574072063299,-250.30577509160247,172.3270492032984
stablelm-tuned-alpha-7b,52.1289833845646,20.744275350683154,-31.794270493619557,35.882492692698335
dolly-v2-12b,52.9955654773545,21.9963877916789,-35.18510777216174,35.245248900914824
llama-13b,47.32258220100872,21.866309045839,-36.18384042000984,35.67958287499361
