,mean,std,lower,upper
gpt-4o-2024-05-13,-15.59112563479105,13.581396154710315,-21.823551311698175,22.790847253322326
claude-3-5-sonnet-20240620,0.436291657640598,21.1516620224576,-35.12008286573721,34.42726375616545
gemini-advanced-0514,-1.7109406229418127,14.858023499421499,-24.966457449324135,23.710627529652267
gemini-1.5-pro-api-0514,-28.205016067983728,14.149455966278842,-23.150820605858787,23.660572660189857
gpt-4-turbo-2024-04-09,-4.343367392372498,11.958316455171126,-18.584726012459697,20.408263261233166
gpt-4-1106-preview,-1.32240746279765,10.745428510291234,-17.39593639521846,17.651368597101385
gemini-1.5-pro-api-0409-preview,13.22381861548864,12.776589321689759,-20.79465142671279,20.897894910500106
claude-3-opus-20240229,-8.264458170725286,9.990478658965277,-17.41430261912723,16.024580970225458
gpt-4-0125-preview,-4.169918394947695,10.99126894764338,-18.110737163949757,18.149051562804118
yi-large-preview,-32.209433157114596,13.67569679426097,-21.70932898679853,23.565979716304028
gemini-1.5-flash-api-0514,-39.161614353745826,14.805207326699795,-24.05313263382356,24.115549023335618
yi-large,-53.964197265399555,22.894135373782916,-36.57910002026864,39.18739506681397
gemma-2-27b-it,-54.0131420670266,29.27053774447866,-49.378278377183236,46.687718867969714
bard-jan-24-gemini-pro,60.42286998893381,20.810384255126497,-35.62640331428207,31.93732895184566
glm-4-0520,-46.220442999028,26.0589338568614,-43.2537993613169,40.24409133737196
nemotron-4-340b-instruct,-63.32654324402161,19.51462867913099,-32.196374398376236,32.664419870802845
llama-3-70b-instruct,-8.218844448367658,9.774202181397417,-16.57608297659082,16.206938312680276
claude-3-sonnet-20240229,-20.294308416861615,10.056392953802538,-16.409544563339225,16.041749642879584
reka-core-20240501,-13.182806819113983,12.979619888910744,-21.977168064751588,20.284238858002283
command-r-plus,-24.117087754250147,11.556900300927143,-18.727053838506993,18.34560238141349
gpt-4-0314,-2.365920772797038,12.751038325541193,-20.812440418038975,20.510544177422403
qwen-max-0428,-12.148154710262377,15.993147400224439,-26.57609660635159,25.904882038872778
qwen2-72b-instruct,-48.55588867865308,15.058081575532844,-25.02278708787961,24.103813696051056
claude-3-haiku-20240307,-26.923889872223203,10.664871253666258,-17.4440596987589,17.6111001959309
gemma-2-9b-it,-63.28242859919779,29.77467909441948,-48.5070501726307,50.01676705186987
deepseek-coder-v2,-31.53240778474511,23.95480829414151,-41.44237535253292,39.68802391434338
glm-4-0116,-35.76550064880622,29.9030247317475,-47.528347949413096,47.649890135411674
qwen1.5-110b-chat,-46.73842797964313,16.94550286723502,-26.94103928485324,28.280667178500345
gpt-4-0613,-36.12056143765758,10.897216191223944,-17.68016053217421,18.483099902783806
reka-flash-preview-20240611,-29.358164134498306,20.018046650478574,-33.44926286410667,33.31667202253159
yi-1.5-34b-chat,-48.115642613567644,17.053482332924016,-27.51932977743558,28.733116404784326
reka-flash-21b-20240226-online,-40.13921531594241,20.468397813394756,-33.091925814559524,32.89180589274497
mistral-large-2402,11.226694375026447,11.544763760221333,-19.116108574862213,19.140614093084633
llama-3-8b-instruct,-28.713824573789214,10.819403098605537,-17.64762285508253,17.95729957419314
qwen1.5-72b-chat,-30.63023755039826,12.084497807997685,-18.86580422644593,19.27967777191557
claude-1,-30.83925769778094,18.126160702198927,-28.749969570783303,30.090451206675223
command-r,-30.448110976026502,12.74376446374989,-20.69143674910811,20.939927840729045
reka-flash-21b-20240226,-19.662313028215422,15.788767505506106,-25.240889065250755,25.747088556223076
mistral-medium,-9.328619618960797,12.899972362609406,-21.04304559282756,21.47210434609029
mixtral-8x22b-instruct-v0.1,-27.75060231797571,12.978068336063748,-20.752617441712932,21.843341982832523
gemini-pro-dev-api,-26.35290538407778,15.154098689270484,-23.941462356964017,24.736004712998913
claude-2.0,-24.647535653944747,21.713587278438904,-36.329392594346174,34.871303647995134
qwen1.5-32b-chat,-52.34695960071405,17.33308951824699,-28.460793225118746,28.05358690757192
zephyr-orpo-141b-A35b-v0.1,15.7354103968183,33.01449878054921,-53.157555337828605,56.614049177323096
mistral-next,-10.740288058777795,14.44824407378883,-23.560067295262165,23.784259951957885
phi-3-medium-4k-instruct,-59.117263963747014,21.91195502307432,-37.29012046249832,35.23465783765747
gpt-3.5-turbo-0613,-15.724334992819761,12.819461266747457,-21.459034118946665,21.015852254344715
qwen1.5-14b-chat,-44.295714067826935,19.718085331381925,-32.467274475147775,34.30212942683731
starling-lm-7b-beta,-49.35104335056971,20.381906654157163,-33.22385292534132,33.09899750318837
claude-2.1,23.89126650220373,13.702073350188922,-23.267540805137106,22.43866613555959
yi-34b-chat,-63.639149042297085,19.85492517184216,-33.027570290462734,33.57097503177925
gemini-pro,-51.93783848421686,27.712370986776907,-46.710563338692076,44.754983124774434
mixtral-8x7b-instruct-v0.1,0.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,-33.51781884783275,11.109883143339614,-18.49144764423346,17.67672011892806
claude-instant-1,-28.308693013855862,16.31768507365164,-27.076934638229407,27.31154890651185
wizardlm-70b,-61.92389764023665,27.180157674312827,-42.411872598188154,44.523918409572296
gpt-3.5-turbo-0314,14.116912167901836,49.25445932110786,-85.18666601681691,78.37459907732155
dbrx-instruct-preview,-54.74336760771453,15.112658624518602,-24.31870936710066,24.93966286301415
phi-3-small-8k-instruct,-12.347181407591313,19.627196926941227,-29.69800439113576,34.45749390703416
tulu-2-dpo-70b,-10.173152444436024,23.17653876889597,-35.60859647290239,39.39349228632928
snowflake-arctic-instruct,-1.0323793713309164,14.729286221061022,-23.640127274148625,24.562133774327613
openchat-3.5-0106,-46.0161225848372,21.664526124979243,-37.994508580436175,33.40601823933583
llama-2-70b-chat,-42.66825683786493,13.335188191279089,-21.782557553930804,21.387926065565892
vicuna-33b,-34.41470954922635,16.765136813533914,-25.929641942986056,27.132607649752316
starling-lm-7b-alpha,-48.36861306818559,20.747654861330005,-32.08004944579082,33.566546056278476
gemma-1.1-7b-it,-64.62695303350496,16.23319976791716,-26.394655900648125,25.845322230200125
nous-hermes-2-mixtral-8x7b-dpo,-1.1543362167110995,27.24306726888717,-45.41705719065789,46.59972713844844
llama2-70b-steerlm-chat,-26.351568120261387,27.54405557383364,-46.259388167185705,43.94406805224719
openchat-3.5,-13.68054570187555,23.80835176215172,-38.88993531560321,37.52880223497674
deepseek-llm-67b-chat,-72.45567505957541,29.799902680258008,-49.164429883024994,50.92508750784215
openhermes-2.5-mistral-7b,-45.09581181991246,26.654215626392375,-43.80167164794479,42.93763969654639
qwen1.5-7b-chat,-53.01462280702579,32.42747897898609,-52.61904895582904,53.67289649449627
pplx-70b-online,-28.77872449859757,28.03893364238569,-46.796706938773866,48.021386709141495
mistral-7b-instruct-v0.2,-88.52994511476103,16.23722894598093,-26.151910255095032,26.43042113056977
gpt-3.5-turbo-1106,-46.978780169401496,18.071189246334406,-29.54495354690907,28.89823553196461
phi-3-mini-4k-instruct,-44.54578863752158,18.478893160584366,-31.060729399365215,29.767779152439502
llama-2-13b-chat,-65.23379279729558,18.9091558794702,-31.22544509633253,31.400417168435453
solar-10.7b-instruct-v1.0,-61.11568864432242,43.28901457511926,-69.72374731103434,69.73859559960154
dolphin-2.2.1-mistral-7b,-28.894867714723947,55.64368438277313,-57.01327952352905,119.56687645973088
wizardlm-13b,-32.28736720976858,33.85529062786176,-55.1242506606052,55.27157053562863
zephyr-7b-beta,-65.83324657225897,22.15266798136857,-37.302976317387916,35.96229321040471
phi-3-mini-128k-instruct,-49.43457585486475,17.23048254227865,-27.55310093996911,27.94489277826193
vicuna-13b,-52.51547207118933,20.133989497109976,-34.379608459399314,33.707715526886226
mpt-30b-chat,-36.6679811672331,59.39344599740548,-79.95024053324732,105.85403015431575
codellama-34b-instruct,-31.72417310375357,27.269793237923473,-43.04420594671843,45.97220619166727
zephyr-7b-alpha,-35.76001199716225,60.35223508882109,-88.01297692494322,97.13720552882856
codellama-70b-instruct,23.603498084842435,12.16714044269657,-18.76185442504235,19.634531385379177
pplx-7b-online,-115.01623735713862,31.544634809862973,-50.94952777412257,53.71255191017626
gemma-7b-it,-54.573079006172016,23.969799103814168,-38.30972715090576,38.5080490087773
llama-2-7b-chat,-110.69992140326492,20.561194135923465,-32.64877907874664,32.9722655910612
qwen-14b-chat,2.6326378759354676,33.6097338242258,-53.376699165168944,54.439254404939845
falcon-180b-chat,31.47929253630939,12.160334990510606,-19.043341232624844,20.40261532162227
guanaco-33b,34.74836738720769,42.266581975037965,-74.52231575473363,74.9308493429557
gemma-1.1-2b-it,-62.58200862881364,25.80673068808925,-43.059664690047256,41.77184422654784
stripedhyena-nous-7b,-46.9663101847988,24.31447135380829,-38.95661650679662,40.66774830496533
olmo-7b-instruct,-117.88373460946305,67.25193873973002,-169.57308276623212,67.14732361432918
mistral-7b-instruct,-112.39450231292795,23.659287858534363,-37.16092402772887,39.661336376030164
palm-2,-110.14858745306078,37.233805494927765,-60.02469901695657,61.236518278844486
vicuna-7b,18.36982240020672,33.30373270763772,-56.31057722414609,52.30232891817663
qwen1.5-4b-chat,-69.52786026896784,29.078890897478516,-47.59982007061342,47.6073724537043
gemma-2b-it,-140.06116546851078,40.99309809388411,-59.3281299176156,62.49927855581316
koala-13b,-92.83482547982567,42.723884832332445,-63.12735444194527,72.64863422741922
chatglm3-6b,-56.034924353599216,134.35178749496535,-176.7324490291949,157.08159636456588
gpt4all-13b-snoozy,129.82410787319168,11.882104184955002,,
chatglm2-6b,134.86369433764332,10.722442264938229,-17.125360709014274,18.25398188815558
mpt-7b-chat,33.055636793869766,97.12832946044712,-114.82789191163634,143.6543964536839
RWKV-4-Raven-14B,-42.92990522841536,52.57698886916364,-78.62055948534348,85.32868968535453
alpaca-13b,-116.5764902035084,52.25201140455559,-76.5503618856718,79.18899717387484
oasst-pythia-12b,-27.622639004236714,41.02249205129322,-67.72430079568758,67.25677817305333
chatglm-6b,182.13865880156675,10.036857816869558,-17.475064777216403,15.796993171110529
fastchat-t5-3b,-58.73162833275951,69.80411853259514,-96.53380366168957,90.46438652847338
stablelm-tuned-alpha-7b,28.745392218723545,195.7107877412751,-206.02592257310664,260.4079710502897
dolly-v2-12b,-9.156489226946928,56.590977802965654,-87.8209463222253,88.11336524553565
llama-13b,71.29856629602402,181.05804485017157,-209.43823940850388,242.25941803190028
