model_a,mean,std,lower,upper
gpt-4o-2024-05-13,12.854462334181255,3.9984723392992376,-6.736004614850172,6.304092216958815
claude-3-5-sonnet-20240620,34.550622971105724,5.756991880639499,-9.966259260244655,9.09956434053845
gemini-advanced-0514,-4.262415055145579,4.403050412132329,-7.4825319035434905,6.9628728266507265
gemini-1.5-pro-api-0514,8.474049086940889,4.2025334021860825,-7.00152213156478,6.829506981223486
gpt-4-turbo-2024-04-09,14.003825378171634,3.6207567621750294,-5.847267823852267,6.029515805650978
gpt-4-1106-preview,8.181070573245174,3.6338122250172082,-5.859288202355071,5.566782853594086
gemini-1.5-pro-api-0409-preview,-14.622729444728453,3.9495779829887163,-5.9500311331450195,6.320431041531638
claude-3-opus-20240229,6.399193940082416,3.2241571201753865,-5.336531074202542,5.05024750484767
gpt-4-0125-preview,3.025733852759713,3.655307053256293,-6.184543869864655,5.985864161579938
yi-large-preview,11.092786730258387,4.258995339073142,-7.092642824332698,6.701506189542658
gemini-1.5-flash-api-0514,9.16671651010516,4.458488552528418,-7.3490090446460385,7.3321947236419955
yi-large,18.30934798378869,7.2692709890098115,-11.863896207571809,11.869612190389876
gemma-2-27b-it,-4.556979678840363,8.825801718230105,-14.039510296668066,14.631487107312992
bard-jan-24-gemini-pro,-34.77110708574606,8.055902736269308,-13.280202094851568,13.105238572725632
glm-4-0520,13.224886867488294,7.680074629917685,-12.373544487101515,12.725491937133143
nemotron-4-340b-instruct,-7.444837754560598,6.32553752143767,-9.943028775816313,11.200637134956143
llama-3-70b-instruct,-0.2811463939850319,3.2289086774312437,-5.266626122568683,5.342486095409336
claude-3-sonnet-20240229,16.703204253376057,3.2945307232055407,-5.0990683884569385,5.390900046529239
reka-core-20240501,-4.283166716037363,3.74005714267475,-5.901493367794997,6.21558032543919
command-r-plus,-21.36078785216599,3.5731584654665967,-5.865041307785676,5.854596377728946
gpt-4-0314,9.664100495947583,4.0837897248799875,-6.737917204177311,6.49508680531651
qwen-max-0428,7.319892840881121,4.853853993386957,-8.349555110429387,7.793926453408144
qwen2-72b-instruct,-1.3211692919199913,5.074150354837589,-8.583758376195512,8.583856090069103
claude-3-haiku-20240307,9.971458951563283,3.4392073793926308,-5.6877755812869175,5.649177930600342
gemma-2-9b-it,-16.76958347405778,8.95813809431431,-15.076695870928404,14.76247935964976
deepseek-coder-v2,61.0518360514701,7.033247220853085,-11.794623465728087,11.406435897981773
glm-4-0116,19.09574273094529,7.89958079165487,-13.027890515657724,12.677942757934225
qwen1.5-110b-chat,10.126367263689259,4.831712617194041,-7.890041693518013,8.059565928296832
gpt-4-0613,2.549824900993458,3.665862438667152,-5.998112872697371,5.747437234260867
reka-flash-preview-20240611,-2.6757087745675445,6.182631193866894,-9.824450056341242,10.052551955732065
yi-1.5-34b-chat,5.687978309651691,5.453261732417151,-8.963428707111632,8.699208124051097
reka-flash-21b-20240226-online,-5.6377238648225045,5.383012251440214,-8.937445393824001,8.350641883543783
mistral-large-2402,13.558196032656435,3.8758401813298553,-6.215677533247712,6.22891019925242
llama-3-8b-instruct,-6.433567247998861,3.487289329932605,-5.500124896246887,5.782249583926852
qwen1.5-72b-chat,10.991397118196952,4.2705592956566765,-7.428440403599987,6.772571365198556
claude-1,-12.121625090557968,6.425183536113235,-10.377149193329322,10.483963708702452
command-r,-23.184853769098872,3.863515829392457,-6.001056423717888,6.3926188167685325
reka-flash-21b-20240226,-3.7173823673276996,4.639313439684792,-8.066381622454381,7.279259167254528
mistral-medium,8.086446428842995,4.571095298724136,-7.183468766715771,7.578001334793734
mixtral-8x22b-instruct-v0.1,9.040102827581395,4.104958147835115,-6.853221720202484,6.417264929870921
gemini-pro-dev-api,-30.819601371533196,6.098960178188709,-9.98132478978387,9.971744028617543
claude-2.0,3.5458263522847155,8.046128775147167,-13.073485768116447,13.481149627172048
qwen1.5-32b-chat,15.915219017934575,5.183598567568862,-8.707914259246774,8.634802794338562
zephyr-orpo-141b-A35b-v0.1,-4.971161310962141,10.36610956398331,-17.189944785470043,16.60130198746664
mistral-next,6.608344159118293,7.234402374519132,-11.759375978997916,12.077701120790188
phi-3-medium-4k-instruct,10.283256009287856,5.865575023483005,-9.192619035639781,9.919229837275473
gpt-3.5-turbo-0613,15.948573280019664,5.216846417511325,-8.332342973208783,8.570887772110975
qwen1.5-14b-chat,9.070388351704619,5.440529412612255,-9.099962981111638,8.801300025413276
starling-lm-7b-beta,12.52044556567551,5.6067281465775745,-9.185113217002067,9.078848270264274
claude-2.1,15.149130071404484,5.003188481686509,-8.74012424593863,7.96502912972057
yi-34b-chat,-9.446836815504561,6.4276372040660545,-10.742484637778329,10.472714911841464
gemini-pro,-19.319245481317864,11.077736079356942,-19.300990597695407,17.259554154007574
mixtral-8x7b-instruct-v0.1,0.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,12.27066776420976,3.8901625617296336,-6.488126856273988,6.054635544397628
claude-instant-1,-1.6560049981860536,6.760876479241301,-10.817679845450751,11.345118151144263
wizardlm-70b,-37.792680917385546,9.6659996818412,-15.30490877008289,15.650373269514589
gpt-3.5-turbo-0314,8.269934969346721,14.526142679109167,-24.104218552103127,24.33647193479702
dbrx-instruct-preview,15.072460853580703,4.554670218582806,-7.408159608422187,7.355831550993372
phi-3-small-8k-instruct,3.4406351006583553,5.824560974690856,-10.007259032633433,9.427435794093444
tulu-2-dpo-70b,-8.070007672343515,10.64810845900276,-17.96708480236296,17.03258700987628
snowflake-arctic-instruct,-19.29608469101652,4.664084190963818,-8.121742354531923,7.310896223255163
openchat-3.5-0106,6.221540898745629,6.939033918883126,-11.386279566533364,11.51699106267455
llama-2-70b-chat,-21.714286114658094,4.715822829768459,-7.6582076764621,7.722133077751401
vicuna-33b,-28.07481736337878,6.060349804860291,-9.678616886914767,9.784243409712257
starling-lm-7b-alpha,-9.901224998776636,7.879459977660865,-12.932157532759566,12.573314784396418
gemma-1.1-7b-it,-2.9983883157708444,4.915598542832484,-7.96389803808187,7.781992727114593
nous-hermes-2-mixtral-8x7b-dpo,-4.401812525783892,11.867831188723414,-19.89513704997549,18.743305615415387
llama2-70b-steerlm-chat,-58.73773393130625,14.20439049611054,-22.979937225308355,23.15720701247821
openchat-3.5,-24.400882866178996,9.726223909825812,-16.304565620995056,15.415809739051475
deepseek-llm-67b-chat,1.977377195821619,11.953224283913157,-19.67496276585468,20.105240772550694
openhermes-2.5-mistral-7b,-18.29606302584907,11.917867406439418,-19.88484640416742,20.163471878732825
qwen1.5-7b-chat,12.922069277874144,10.366577030936993,-17.733450054355533,17.95390255460476
pplx-70b-online,-49.26802262237837,9.837583465076678,-15.54832508323841,16.01164062541602
mistral-7b-instruct-v0.2,-0.0611733292087456,5.744531282329603,-9.60064338248414,9.202676906581702
gpt-3.5-turbo-1106,24.6830033540251,6.4097244933055455,-10.267587941096348,10.780029389568135
phi-3-mini-4k-instruct,15.556448728816257,5.322730255265577,-8.691733474458612,8.447361001108275
llama-2-13b-chat,-15.9313555877656,6.481133762340321,-10.754144365015964,10.492473864360507
solar-10.7b-instruct-v1.0,-15.659253859441156,12.796008399384386,-21.134897085068555,19.83253872256884
dolphin-2.2.1-mistral-7b,-40.24422501837593,20.857818128220394,-34.58482604150667,34.22470995488467
wizardlm-13b,-36.997884276893075,10.934414757507495,-17.323073590532964,18.771697469668922
zephyr-7b-beta,-23.637341937331254,8.65079627609262,-13.764378450496082,14.165590646446962
phi-3-mini-128k-instruct,-22.893535979065582,5.400990823936156,-9.228637566714582,8.738651491823585
vicuna-13b,-14.655259124728236,7.244554836348809,-11.779297057389469,11.62308472920261
mpt-30b-chat,-20.784389050792278,17.706352500098177,-30.336494947543976,28.46592313614635
codellama-34b-instruct,-7.814789308204534,10.370898844050666,-17.214528154341778,16.86512641470565
zephyr-7b-alpha,-13.655841996309327,20.331732537513396,-32.82999401453305,31.87380720526724
codellama-70b-instruct,1.5271999093271367,21.93759200037184,-36.910435525969504,34.76083562811272
pplx-7b-online,-29.801021422446954,10.608161330595845,-17.08422708547499,17.825229913217374
gemma-7b-it,7.340215682675202,7.983826826111171,-12.81184851151479,13.671035222675162
llama-2-7b-chat,-37.623438399261296,7.042062290850624,-11.685064932992777,11.328829654019316
qwen-14b-chat,16.0378474691285,12.31498770487292,-20.49409507743632,19.580847416583474
falcon-180b-chat,-22.271358125219784,25.45951755718646,-43.48988246684141,41.99980777196004
guanaco-33b,-68.82501469164208,16.873216349538442,-27.892582820240968,28.315221267765068
gemma-1.1-2b-it,7.671195640681792,6.899103299864333,-11.121510038972843,11.462334667670728
stripedhyena-nous-7b,-20.75109494031673,11.510607365441137,-17.90395967972353,18.801190964957403
olmo-7b-instruct,-5.004884320341899,10.408190353664938,-16.312819759782585,17.280865468942167
mistral-7b-instruct,-4.756695799136374,9.159144477536843,-14.488155560865273,15.260345717670603
palm-2,-21.7181532160186,10.357515474009249,-16.573132434506864,16.47534360030459
vicuna-7b,-26.996216911653107,11.001019004852969,-17.32975352319758,17.80681754980928
qwen1.5-4b-chat,-11.157275328383795,8.798197955532386,-14.265937579977482,15.55451290546578
gemma-2b-it,-0.22177667848215105,11.0485101736328,-18.415195538046202,18.192769784813276
koala-13b,-30.079962183431476,11.801137914794353,-18.893904941853204,18.673800891404568
chatglm3-6b,-8.546317556894397,13.626120735822484,-22.268276818647834,22.55770322703538
gpt4all-13b-snoozy,-27.819354654710537,22.028803623415463,-37.52443885427265,35.340337755071346
chatglm2-6b,-42.71529902328109,18.30628290067514,-30.479688090179295,29.900237272724482
mpt-7b-chat,-28.10046124127222,15.997392261152145,-24.663461908613204,26.119779115039584
RWKV-4-Raven-14B,-29.782996154508247,13.93043642763346,-22.16691996877339,23.674910797727897
alpaca-13b,-117.48908238153192,14.442632094473655,-24.013365431124882,22.747552534655654
oasst-pythia-12b,-26.333905890179135,12.86171652164154,-21.613829310195115,22.40691106505702
chatglm-6b,0.7939009049777254,13.658224863415693,-23.328103866335926,22.486690144458947
fastchat-t5-3b,-116.82671511786049,15.471685309555443,-25.154790052015215,25.775404771875316
stablelm-tuned-alpha-7b,9.866354593750287,16.901025486775506,-26.26722335734786,27.30666865865004
dolly-v2-12b,-77.83527755473712,17.013500625125296,-28.533326424351678,27.31756387741399
llama-13b,-134.0792432720232,21.871647400306,-35.56043181733955,35.411772153987386
