,mean,std,lower,upper
gpt-4o-2024-05-13,52.364823198880615,5.304479996463907,-9.156400599930329,8.966019423765687
claude-3-5-sonnet-20240620,44.776295953739464,7.36170029762744,-12.072075313203001,12.085903444916333
gemini-advanced-0514,68.81552473238445,5.512025765796111,-8.930574986098193,8.718851595438771
gemini-1.5-pro-api-0514,75.53819208340983,5.472735916534396,-8.682142879203923,8.497099358194859
gpt-4-turbo-2024-04-09,50.2818210656777,4.373432681282278,-7.057644081582566,7.398552057383107
gpt-4-1106-preview,52.4681074905695,4.756915309709704,-7.89657618351891,7.457897772317061
gemini-1.5-pro-api-0409-preview,54.58600497775029,5.142521442456929,-8.51566126597828,8.335473152715053
claude-3-opus-20240229,70.34776855072263,3.985439650821416,-6.570529464100858,6.59460074164447
gpt-4-0125-preview,52.61704692328984,4.46786597682306,-7.437362165712614,7.24756506203429
yi-large-preview,83.84828731019958,5.497738733567841,-8.799879722551509,8.858156097333918
gemini-1.5-flash-api-0514,66.44551219443659,5.621662664800008,-9.300860462510826,9.420334564613157
yi-large,57.72522863369461,9.343649921911155,-15.112539281096133,15.305436553078941
gemma-2-27b-it,54.18240175159026,11.96276044872905,-20.076892418554237,20.121180404508678
bard-jan-24-gemini-pro,58.57301640437484,14.374182522989376,-22.500560177324147,24.09978754361918
glm-4-0520,89.73822924105002,10.065654819555617,-15.974794170845001,17.219888648372915
nemotron-4-340b-instruct,62.12146233331918,7.4916834791212885,-12.435135848767054,12.342341680490762
llama-3-70b-instruct,-32.238323198902386,4.0788849395017985,-6.58340146713072,6.532493659322228
claude-3-sonnet-20240229,48.48294765278086,3.8007616531793103,-6.606234685443297,6.12467153334854
reka-core-20240501,62.79253796079689,5.098024876625093,-8.478256685091935,8.348490964352322
command-r-plus,63.11575787978032,4.272882141512491,-7.029977881518349,6.9193427764251325
gpt-4-0314,53.814630343081085,5.032457535643904,-8.316853922682228,8.327684126245302
qwen-max-0428,105.41469715133273,6.9248465462619855,-11.415915838927958,11.25461147554553
qwen2-72b-instruct,106.30477152876176,6.356510488379663,-10.378025057812465,10.552339108819027
claude-3-haiku-20240307,32.565762385880035,4.106514882227224,-7.049794113911446,6.435950514993898
gemma-2-9b-it,56.49838191945181,11.669421193745967,-19.479076196897033,19.886309632717733
deepseek-coder-v2,80.1158245621619,9.648856006192677,-16.211858501823343,16.185481912128637
glm-4-0116,99.17283772014851,11.076085722907116,-17.887350374416258,18.531494884983573
qwen1.5-110b-chat,95.8979666662998,6.236427326476913,-10.130241419717407,10.376136366437692
gpt-4-0613,24.93309252635531,4.5978454109669284,-7.79612111700591,7.575705874082811
reka-flash-preview-20240611,52.173665279850745,8.076048533984777,-13.46170548402295,13.39756562624769
yi-1.5-34b-chat,109.66511015419033,6.69271683769483,-11.130965923779328,11.024825649183612
reka-flash-21b-20240226-online,46.346576344737635,6.736306505527797,-10.907933430000092,11.008148436131727
mistral-large-2402,18.22066182334493,4.708449229317513,-7.8329380931179635,7.91709393198918
llama-3-8b-instruct,-14.787939641277339,4.148186446729838,-6.786241200206117,6.704271521247977
qwen1.5-72b-chat,93.94329562813971,5.49685106319558,-9.079118661817333,9.021997785937145
claude-1,48.58906996710945,9.15444261440205,-13.732344440896036,14.390251892384711
command-r,66.03070784729209,4.558615763093069,-7.510241646399805,7.200404150138141
reka-flash-21b-20240226,45.636815578146134,6.17416791824098,-9.935494329415683,10.203889600264844
mistral-medium,19.921267833083636,6.109454127139964,-10.363249630372199,9.948144103892554
mixtral-8x22b-instruct-v0.1,39.64543800443457,5.32436112117445,-9.02404725633123,8.468248068359294
gemini-pro-dev-api,50.852362021002214,7.713394426481156,-12.617221837845285,12.607708936057044
claude-2.0,57.61718552009457,13.08064148559394,-21.03608108357559,21.75037110105368
qwen1.5-32b-chat,104.12296291643956,5.905327403765685,-9.754941003662154,9.613957057490566
zephyr-orpo-141b-A35b-v0.1,21.967561778899007,11.247891211660132,-18.08534549216289,18.79723829593095
mistral-next,12.43604988846866,10.613858859040556,-17.75745923378677,17.189368823439093
phi-3-medium-4k-instruct,36.28605059764214,7.401091053831069,-11.932511112173593,12.157685788318968
gpt-3.5-turbo-0613,40.447382854723664,8.02090127995097,-13.26234613635313,13.503704522760042
qwen1.5-14b-chat,89.40694006399609,6.003916756837281,-9.666130702628067,9.835205038923519
starling-lm-7b-beta,65.97192430510849,6.408640315051965,-10.287437660399007,10.509851253488847
claude-2.1,32.28759350925259,6.545340226895932,-11.80801878001948,10.890572476670933
yi-34b-chat,115.94136416300842,8.68841122721946,-14.278048748738883,14.653672427818279
gemini-pro,51.53094691299849,18.279542580924392,-30.15558791594202,29.428634681625415
mixtral-8x7b-instruct-v0.1,0.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,29.252977873645854,4.654368249134857,-7.688142491365802,7.60889312954912
claude-instant-1,37.92419567003111,10.56307695536196,-17.1771583296605,17.474514384977887
wizardlm-70b,9.904362175775255,16.590081749960877,-27.668059050472387,27.484334835199952
gpt-3.5-turbo-0314,95.00046368700113,18.64297535727154,-30.527984417850064,30.386726930248415
dbrx-instruct-preview,29.820472533762437,5.1864237046968205,-8.497759030556875,8.2017896319031
phi-3-small-8k-instruct,21.3447344828781,7.021681512212904,-11.591705117845184,11.654604089290812
tulu-2-dpo-70b,-33.35889706487636,18.046242702069513,-31.272568552065444,28.971813656158673
snowflake-arctic-instruct,53.479182452995666,5.710757116664313,-9.577308669663033,9.213158953161873
openchat-3.5-0106,48.039032301586325,8.209500473696236,-13.033185805449634,14.045570062829867
llama-2-70b-chat,-39.316830862281186,5.772471791918989,-9.39349565886091,9.464975896498096
vicuna-33b,8.99089500875371,8.17395968274533,-12.91227416565944,13.141381333123325
starling-lm-7b-alpha,19.652243485279154,10.542095395297306,-17.11882456926639,17.458010241495376
gemma-1.1-7b-it,38.802811158216144,6.114780580233126,-9.807618503333693,9.92580407396203
nous-hermes-2-mixtral-8x7b-dpo,-8.928437984905667,29.3760619913544,-48.18111423449341,47.56609034216957
llama2-70b-steerlm-chat,14.083371819790113,22.210431700599127,-35.50255363606807,37.54065895241202
openchat-3.5,53.073741688761174,16.25983103422775,-25.73447967286897,26.77552570624815
deepseek-llm-67b-chat,102.52279724980754,20.65636877875364,-33.827521734510256,34.769661761152705
openhermes-2.5-mistral-7b,7.801106904917264,20.39428678334679,-33.28116278507048,34.635329928569014
qwen1.5-7b-chat,122.09040851120636,13.530437418855971,-23.296647406125174,22.729145646787572
pplx-70b-online,54.36524570715459,18.14144618556988,-30.5116450315872,29.580200336726335
mistral-7b-instruct-v0.2,7.980407524624432,6.822812655736664,-10.727314906313492,11.241422399373072
gpt-3.5-turbo-1106,10.198840682181883,11.443924362875764,-19.02591592244226,17.771278603246213
phi-3-mini-4k-instruct,12.47647999567965,7.246846631973919,-11.515312056326604,12.00594672519339
llama-2-13b-chat,-17.387232789707685,9.31534432516839,-15.425302357188434,15.495879219723566
solar-10.7b-instruct-v1.0,-0.17232439745303543,22.185006986420063,-35.91253125218602,37.18983489043596
dolphin-2.2.1-mistral-7b,43.680990189045296,33.300307950147825,-54.57922460369371,56.31482814413071
wizardlm-13b,27.839354063597995,15.97513548914644,-26.001331740866945,26.072374136854485
zephyr-7b-beta,-31.102895596078433,14.83629598496864,-24.348117824599672,24.86341036379614
phi-3-mini-128k-instruct,47.03171749224002,6.857269445703416,-11.41443088616824,11.103722837960753
vicuna-13b,63.15986617796399,9.534386538213708,-16.310771672528745,15.512813043651967
mpt-30b-chat,-3.2081157869581203,28.935862708554104,-48.268966981605985,48.365756397278204
codellama-34b-instruct,-17.004508842287443,18.15769461065981,-29.410330131221293,31.500485794591334
zephyr-7b-alpha,1.0464389795959148,40.90819388751517,-68.44726099151694,68.52963385994698
codellama-70b-instruct,62.01152972625948,23.73860287844652,-37.184636249315474,39.82981796486784
pplx-7b-online,46.949168204609464,19.81629947157079,-31.801022096609046,33.26262325244073
gemma-7b-it,64.16772839389415,9.607511166374534,-15.584773605305784,15.963837160178102
llama-2-7b-chat,-10.686846589295156,9.56257564822095,-15.602122045921618,15.890944143895043
qwen-14b-chat,93.80998603024732,20.991142237323956,-36.46213590910402,34.709460429877375
falcon-180b-chat,-5.762971128703682,53.19949557282576,-103.14212305812337,79.85071180256274
guanaco-33b,-8.09555815830848,23.291251364976976,-37.76726215185122,37.54456764540064
gemma-1.1-2b-it,54.351836930406584,8.772176491255761,-13.83627077453557,13.944691080311536
stripedhyena-nous-7b,10.956356978363244,22.361595670923666,-36.716367053430794,36.251855431765826
olmo-7b-instruct,61.99822404568508,11.423583895951367,-18.549270948133596,18.374305263458126
mistral-7b-instruct,-8.40513127618666,15.8716333921279,-25.709062695983782,26.145434869041388
palm-2,-68.93408785379529,18.572847793529373,-32.35878254630404,30.618779157479864
vicuna-7b,30.0455454956331,16.37847991687607,-27.84211132205308,25.628271500667665
qwen1.5-4b-chat,92.00588646236545,9.805815145241958,-16.462167195145398,15.782024078471025
gemma-2b-it,67.15661119781697,12.59741701050154,-21.289676234193536,20.799793356078126
koala-13b,-34.037983818954366,15.551026115861486,-25.665537474910046,25.862239339987646
chatglm3-6b,158.85782838010866,20.475298978253733,-35.23476397533261,33.06094784649865
gpt4all-13b-snoozy,-5.710223825281905,29.18391481761224,-44.91693876866041,48.80482016026378
chatglm2-6b,119.92299915994177,29.43522064933557,-46.265978539892544,47.270029128900575
mpt-7b-chat,73.27143677551364,19.88335872323541,-32.0391276050576,33.96824723286405
RWKV-4-Raven-14B,28.901876466954825,18.836499850910243,-30.944947792825864,31.58661997526245
alpaca-13b,-64.5351201385538,18.67757520329951,-30.913494533320097,30.449823264003967
oasst-pythia-12b,-40.61042794551554,16.475487809213202,-26.569310878896076,26.81138476792414
chatglm-6b,251.6744014300437,18.971423123155322,-30.330698010837125,30.02984323740455
fastchat-t5-3b,22.32561868385652,189.15740721051597,-250.30577511952708,172.32704917092968
stablelm-tuned-alpha-7b,52.12898343974239,20.744275255908104,-31.794270438320304,35.88249274914671
dolly-v2-12b,52.99556551569829,21.99638764060825,-35.18510773641181,35.245245994446435
llama-13b,47.32258214320096,21.866309110925492,-36.18384047920513,35.67958279901569
