model_a,mean,std,lower,upper
gpt-4o-2024-05-13,12.854462334167081,3.9984723392763284,-6.736004614867088,6.304092216629035
claude-3-5-sonnet-20240620,34.55062297112889,5.756991880633065,-9.966259256947971,9.099564340578304
gemini-advanced-0514,-4.262415055179137,4.403050412092955,-7.482531903560519,6.962872826635335
gemini-1.5-pro-api-0514,8.474049086914842,4.202533402191324,-7.001522131605412,6.829506981171155
gpt-4-turbo-2024-04-09,14.003825378166159,3.6207567621556915,-5.84726782390193,6.029515805674652
gpt-4-1106-preview,8.181070573241012,3.633812225000567,-5.859288202625146,5.566782853597882
gemini-1.5-pro-api-0409-preview,-14.622729444729357,3.9495779829736612,-5.950031133145707,6.320431041523674
claude-3-opus-20240229,6.399193940199254,3.224157120243252,-5.336531074102881,5.05024750431256
gpt-4-0125-preview,3.0257338527594224,3.6553070532591088,-6.1845438698696515,5.985864161759512
yi-large-preview,11.092786730236591,4.2589953390643345,-7.09264282388199,6.701506189306153
gemini-1.5-flash-api-0514,9.16671651008174,4.458488552518492,-7.349009044652339,7.332194723530308
yi-large,18.30934798380287,7.269270989158531,-11.863896207487667,11.869612190395848
gemma-2-27b-it,-4.556979678816479,8.825801718228288,-14.03951029577956,14.631487107247121
bard-jan-24-gemini-pro,-34.77110708573314,8.055902736237408,-13.280202094919513,13.105238571781385
glm-4-0520,13.224886867463004,7.680074629984016,-12.37354448906836,12.725491937107739
nemotron-4-340b-instruct,-7.444837754638614,6.32553752145477,-9.943028771299858,11.20063713462829
llama-3-70b-instruct,-0.2811463939665846,3.2289086775488895,-5.266626122457888,5.34248609539038
claude-3-sonnet-20240229,16.703204253403264,3.294530723152646,-5.09906838874377,5.390900046216011
reka-core-20240501,-4.283166716046249,3.7400571426467306,-5.901493367799369,6.215580325454054
command-r-plus,-21.36078785218171,3.573158465434102,-5.865041307801974,5.854596377760444
gpt-4-0314,9.664100495943664,4.083789724869962,-6.737917204186893,6.495086805309102
qwen-max-0428,7.319892840923819,4.85385399337745,-8.349555110382687,7.793926448010847
qwen2-72b-instruct,-1.3211692919364006,5.074150354758878,-8.583758376399231,8.583856090100614
claude-3-haiku-20240307,9.971458951498732,3.4392073794719384,-5.687775582646907,5.649177930655367
gemma-2-9b-it,-16.769583474097153,8.958138094308195,-15.076695871040982,14.762479359559283
deepseek-coder-v2,61.051836051466246,7.033247220844433,-11.794623466760832,11.406435898038701
glm-4-0116,19.095742730934692,7.899580791737019,-13.027890515592023,12.677942758593028
qwen1.5-110b-chat,10.126367263722594,4.831712617166122,-7.890041693367719,8.059565927782595
gpt-4-0613,2.549824901010872,3.6658624386531766,-5.99811287267865,5.747437234252509
reka-flash-preview-20240611,-2.675708774536311,6.182631193884517,-9.824450055371479,10.052551955777556
yi-1.5-34b-chat,5.687978309655824,5.453261732530268,-8.963428707102214,8.699208124100613
reka-flash-21b-20240226-online,-5.637723864825122,5.383012251450794,-8.937445396279436,8.350641883367032
mistral-large-2402,13.558196032646054,3.87584018131798,-6.215677533252945,6.2289101992468705
llama-3-8b-instruct,-6.4335672480491874,3.487289329868323,-5.500124896310732,5.782249583903105
qwen1.5-72b-chat,10.991397118200814,4.270559295647946,-7.428440403607096,6.772571365202986
claude-1,-12.12162509052953,6.425183536114803,-10.377149194355875,10.483963709166332
command-r,-23.18485376910141,3.863515829391262,-6.00105642371981,6.392618816877263
reka-flash-21b-20240226,-3.717382367309723,4.639313439668266,-8.066381622322002,7.2792591672123645
mistral-medium,8.086446428860807,4.571095298725043,-7.183468766682134,7.578001334796675
mixtral-8x22b-instruct-v0.1,9.040102827583366,4.104958147821135,-6.853221720240155,6.417264929799735
gemini-pro-dev-api,-30.819601371517603,6.098960178241972,-9.981324789806113,9.971744028634259
claude-2.0,3.5458263523351645,8.046128775167288,-13.0734857680598,13.481149629027787
qwen1.5-32b-chat,15.915219017831102,5.18359856758088,-8.707914259692775,8.634802794916471
zephyr-orpo-141b-A35b-v0.1,-4.971161310988056,10.366109564107775,-17.18994478546715,16.601301987089663
mistral-next,6.608344159073803,7.234402374544968,-11.75937597904052,12.077701120752813
phi-3-medium-4k-instruct,10.283256009297455,5.865575023420566,-9.192619035757852,9.91922983663457
gpt-3.5-turbo-0613,15.948573280051576,5.2168464175035485,-8.332342973167389,8.570887772116375
qwen1.5-14b-chat,9.070388351722706,5.440529412617985,-9.099962983313173,8.801300025288608
starling-lm-7b-beta,12.52044556567859,5.606728146601813,-9.185113214763698,9.078848270349289
claude-2.1,15.149130071427168,5.003188481690446,-8.740124245700406,7.965029129741218
yi-34b-chat,-9.446836815476583,6.427637204029398,-10.742484637752478,10.47271491319368
gemini-pro,-19.319245481324728,11.077736079293988,-19.300990597656206,17.25955415391153
mixtral-8x7b-instruct-v0.1,0.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,12.270667764205923,3.8901625617211257,-6.488126856277107,6.054635544395962
claude-instant-1,-1.6560049982023277,6.7608764792500695,-10.817679845490193,11.345118151148272
wizardlm-70b,-37.79268091735526,9.665999681827204,-15.304908771834146,15.650373269524273
gpt-3.5-turbo-0314,8.269934969416472,14.526142678913095,-24.104218550422356,24.33647193529718
dbrx-instruct-preview,15.072460853626103,4.554670218570719,-7.408159608355119,7.3558315510238
phi-3-small-8k-instruct,3.4406351006724765,5.824560974676816,-10.007259032644413,9.427435794105802
tulu-2-dpo-70b,-8.070007672378853,10.648108459020438,-17.96708480811928,17.032587009776723
snowflake-arctic-instruct,-19.296084690966563,4.664084190907649,-8.1217423543236,7.310896223304976
openchat-3.5-0106,6.22154089867679,6.93903391882802,-11.386279566914219,11.516991062368483
llama-2-70b-chat,-21.714286114637428,4.715822829781914,-7.658207676448345,7.722133077759832
vicuna-33b,-28.07481736336977,6.06034980485937,-9.678616886632728,9.78424340969838
starling-lm-7b-alpha,-9.901224998726708,7.879459977725989,-12.932157531458833,12.57331478444246
gemma-1.1-7b-it,-2.998388315803726,4.915598542833865,-7.96389803567228,7.781992727061031
nous-hermes-2-mixtral-8x7b-dpo,-4.401812525755093,11.867831188597462,-19.895137050114364,18.743305615449973
llama2-70b-steerlm-chat,-58.73773393149224,14.204390496171209,-22.979937224542155,23.157207012605745
openchat-3.5,-24.40088286611424,9.726223909842846,-16.30456562091591,15.415809738487415
deepseek-llm-67b-chat,1.9773771958264543,11.953224283909572,-19.674962766015376,20.105240772688283
openhermes-2.5-mistral-7b,-18.29606302579464,11.917867406457267,-19.884846404053487,20.163471877877758
qwen1.5-7b-chat,12.922069277879332,10.36657703109396,-17.73345007042797,17.953902554700704
pplx-70b-online,-49.26802262234466,9.83758346510657,-15.548325083148946,16.01164062544433
mistral-7b-instruct-v0.2,-0.061173329235548184,5.744531282347442,-9.600643382594365,9.202676906582823
gpt-3.5-turbo-1106,24.68300335404061,6.409724493288763,-10.267587941080077,10.780029392783556
phi-3-mini-4k-instruct,15.556448728775724,5.322730255306819,-8.691733474341078,8.447361000998757
llama-2-13b-chat,-15.931355587756942,6.4811337623376595,-10.754144364844702,10.492473864194396
solar-10.7b-instruct-v1.0,-15.659253859461053,12.796008399367453,-21.134897087109977,19.8325387238298
dolphin-2.2.1-mistral-7b,-40.244225018289754,20.85781812833828,-34.58482603434461,34.22470995549503
wizardlm-13b,-36.997884276903996,10.934414757490053,-17.323073590517453,18.771697469619497
zephyr-7b-beta,-23.637341937337055,8.650796276129993,-13.764378446729596,14.165590646457957
phi-3-mini-128k-instruct,-22.893535979080433,5.400990823928854,-9.228637565911184,8.73865149872779
vicuna-13b,-14.655259124751097,7.2445548363172465,-11.779297057417333,11.623084729346779
mpt-30b-chat,-20.78438905078588,17.706352500094937,-30.33649494745378,28.465923133895654
codellama-34b-instruct,-7.8147893082087,10.370898844026737,-17.214528154418556,16.865126414701226
zephyr-7b-alpha,-13.655841996308842,20.33173253731586,-32.829994014550024,31.87380720478128
codellama-70b-instruct,1.5271999090408177,21.93759200079801,-36.91043552574577,34.760835629721306
pplx-7b-online,-29.80102142243278,10.60816133060428,-17.08422708549093,17.825229913123867
gemma-7b-it,7.340215682687254,7.98382682621507,-12.811848511304468,13.671035222871431
llama-2-7b-chat,-37.623438399282904,7.042062290837486,-11.685064933122266,11.328829654184332
qwen-14b-chat,16.03784746921215,12.314987704873918,-20.494095077350945,19.580847415942788
falcon-180b-chat,-22.271358125314503,25.459517557250273,-43.4898824656474,41.99980777228048
guanaco-33b,-68.82501469155649,16.873216349607137,-27.892582821623847,28.315221267866207
gemma-1.1-2b-it,7.6711956407196285,6.899103299875821,-11.121510044033258,11.462334667736963
stripedhyena-nous-7b,-20.75109494024723,11.510607365357478,-17.90395967958434,18.801190964726434
olmo-7b-instruct,-5.004884320288225,10.408190353734426,-16.312819761542688,17.28086546826949
mistral-7b-instruct,-4.756695799191432,9.159144477420964,-14.488155560917416,15.26034571651835
palm-2,-21.718153216002523,10.35751547405348,-16.57313243457435,16.475343600120343
vicuna-7b,-26.996216911654685,11.001019004805256,-17.329753522877333,17.806817549753845
qwen1.5-4b-chat,-11.157275328467092,8.79819795550586,-14.265937580047096,15.55451290544451
gemma-2b-it,-0.22177667855857885,11.048510173658975,-18.415195538177876,18.192769786027817
koala-13b,-30.07996218337933,11.801137914783594,-18.893904940508456,18.67380089129862
chatglm3-6b,-8.546317556837453,13.626120735723905,-22.26827681719385,22.557703228232498
gpt4all-13b-snoozy,-27.819354654641455,22.02880362363692,-37.52443885430756,35.340337754346876
chatglm2-6b,-42.71529902325994,18.306282900677353,-30.479688089423234,29.900237272890415
mpt-7b-chat,-28.100461241199945,15.99739226110739,-24.663461908233398,26.119779115130477
RWKV-4-Raven-14B,-29.782996154467792,13.93043642735956,-22.16691996872484,23.67491079756942
alpaca-13b,-117.48908238143416,14.44263209446316,-24.013365431270344,22.747552534878977
oasst-pythia-12b,-26.333905890107015,12.86171652164232,-21.613829309962576,22.406911066488792
chatglm-6b,0.7939009050346995,13.65822486340841,-23.32810386685228,22.486690144611902
fastchat-t5-3b,-116.82671511782796,15.471685309717069,-25.1547900504292,25.775404772334028
stablelm-tuned-alpha-7b,9.866354593888213,16.901025486637753,-26.26722335743679,27.30666865883314
dolly-v2-12b,-77.83527755463312,17.01350062523476,-28.533326426197164,27.317563877517827
llama-13b,-134.07924327184904,21.871647400368985,-35.56043181978019,35.41177215395683
