model_a,mean,std,lower,upper
gpt-4o-2024-05-13,-18.69613012458005,2.6725768538860444,-4.418311772601392,4.300996536340463
claude-3-5-sonnet-20240620,-23.27098339690796,4.149777749473215,-6.93522852463267,6.853985355535848
gemini-advanced-0514,-28.15665745011227,3.045854182020185,-5.099266980236077,4.799575178459172
gemini-1.5-pro-api-0514,-25.147296420705608,2.918054350406903,-4.880667183599531,4.763581075142405
gpt-4-turbo-2024-04-09,-11.754161703409574,2.349861806807217,-3.991842067650161,3.65989007473749
gpt-4-1106-preview,-17.360738698221063,2.238681575053216,-3.9450435235276,3.437234314209242
gemini-1.5-pro-api-0409-preview,-16.380111843677675,2.57042804173665,-4.009359404670386,4.332234900599392
claude-3-opus-20240229,-31.1346702956104,2.018593956014335,-3.4383656246550487,3.357490571401179
gpt-4-0125-preview,-16.165620700783982,2.282958046222015,-3.5965375289237027,3.746640453242197
yi-large-preview,-18.9773497199168,2.846075488037934,-4.423038039010358,4.683871390457728
gemini-1.5-flash-api-0514,-24.613775608443643,3.030982602556553,-4.6826514953103775,5.0054006222901535
yi-large,-15.589450451266758,5.104039616465396,-8.17613984000786,8.343507421114568
gemma-2-27b-it,-20.282818948827185,6.490684700206288,-10.977193082240557,10.137718287545532
bard-jan-24-gemini-pro,-25.257113767468496,4.366852474571798,-6.96923645289279,6.986310662870878
glm-4-0520,-14.720795796529941,5.173263258459942,-8.696850029231754,8.224675894759915
nemotron-4-340b-instruct,-21.672900525889162,4.068919279152346,-6.4351854486657825,6.393381494795399
llama-3-70b-instruct,21.810620960066807,2.0893161474807536,-3.3985045883542035,3.280701844052757
claude-3-sonnet-20240229,-23.311517694761815,2.106647248755252,-3.355290829625961,3.5133643418266125
reka-core-20240501,-20.385755695315574,2.491261202642235,-4.3004637011346425,4.02537060026302
command-r-plus,-25.34875173265833,2.2406776502802668,-3.6549005674616737,3.5933387637349163
gpt-4-0314,-22.979889198027347,2.49557857383184,-4.103892716806943,4.156568173849706
qwen-max-0428,-16.574974948504796,3.0119653109386357,-4.901247216585823,5.141223367188953
qwen2-72b-instruct,-17.27494796328347,3.3344819654065687,-5.285949603658823,5.523298531532063
claude-3-haiku-20240307,-22.551619660466315,2.2590079350280763,-3.7577134409644124,3.5608519568934653
gemma-2-9b-it,-23.941600460700226,6.28298516172925,-10.07288969535044,10.52542627689903
deepseek-coder-v2,-27.754391356321133,5.066872989077811,-8.13342155163022,8.297675748428635
glm-4-0116,2.340641675512885,5.620319865116469,-9.106888273800436,9.025153949583549
qwen1.5-110b-chat,-13.468869302685915,3.0090350340034635,-5.082503535108106,4.762142554644081
gpt-4-0613,-18.013071583875732,2.2431016408560005,-3.755449070001532,3.546042724816445
reka-flash-preview-20240611,-21.502606702049448,4.6003000877218,-7.72181434933117,7.7332811968381385
yi-1.5-34b-chat,4.794922259455883,3.6032372397022407,-5.787681071671677,5.747660867274147
reka-flash-21b-20240226-online,-19.01992108427154,3.900401902819417,-6.65650161846945,6.087380774581742
mistral-large-2402,-8.544444089249582,2.472080020556679,-4.176786112326435,3.9863105691737992
llama-3-8b-instruct,10.501808981845066,2.230507454449282,-3.7449878276626922,3.6300452198878688
qwen1.5-72b-chat,-15.581496914182308,2.581220658437057,-4.254337422594002,4.2888540302496505
claude-1,-20.834741863923874,3.5263028136931918,-5.759638547763016,5.825755869549745
command-r,-25.637835753939843,2.5886698205511274,-4.296005038667797,4.201308468093689
reka-flash-21b-20240226,-20.233562278157528,3.1721481464543277,-5.148375796095857,5.080747145436291
mistral-medium,-8.525015840941979,2.843922597674801,-4.943522952819398,4.687095795012715
mixtral-8x22b-instruct-v0.1,-9.953083682653721,2.6105191969259773,-4.216613051348928,4.429021953193114
gemini-pro-dev-api,-26.258274185620458,3.7292939795407847,-5.839481147881095,6.266680655981652
claude-2.0,-18.500483453777367,4.210767241479991,-6.8229965169697735,6.928433596113603
qwen1.5-32b-chat,-20.71118274172726,3.3132219668269647,-5.222743001980131,5.503553783081735
zephyr-orpo-141b-A35b-v0.1,-8.839276892013276,6.518329508041548,-10.58305227046402,11.440048449491679
mistral-next,-15.990142682412344,4.111544218704788,-6.878818439534607,6.495950669848192
phi-3-medium-4k-instruct,-7.041110070668478,4.226265234908128,-6.554312112012937,6.957543117039498
gpt-3.5-turbo-0613,-21.085785292976567,2.7803258314004693,-4.523643299016527,4.496701796282132
qwen1.5-14b-chat,-19.16202141415288,3.626128311161297,-6.065771353310845,6.007019714689704
starling-lm-7b-beta,-12.152412009573206,3.9022767091173494,-6.298266132210206,6.446495541307312
claude-2.1,-22.151227083951774,2.834485138459762,-4.5166727048994915,4.610087657559408
yi-34b-chat,-8.85839343855108,3.882382352372168,-6.277395449697856,6.4860908010381095
gemini-pro,-15.625177842916383,5.713737418990998,-9.357065307333997,9.638146395195356
mixtral-8x7b-instruct-v0.1,0.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,-23.946416325665535,2.367627537981203,-3.9526605839758915,3.952844396775159
claude-instant-1,-15.543791063461041,3.663385460636259,-5.786384762881168,6.078965306504822
wizardlm-70b,-11.163301008557132,5.089054100721465,-8.672403324366257,8.226869491203361
gpt-3.5-turbo-0314,-26.487512733102285,6.909347492032962,-11.765986905697119,11.24634065903853
dbrx-instruct-preview,-1.6168766099180834,2.880289042933459,-4.667205541457164,4.655627322521459
phi-3-small-8k-instruct,1.6924362021186772,4.15662766804728,-6.7219327672351215,6.928428252476772
tulu-2-dpo-70b,-10.764505638627957,5.562953434452887,-9.28947058821866,8.52168658320865
snowflake-arctic-instruct,-19.696384669813813,2.8397370206513353,-4.640712338770754,4.708241034377988
openchat-3.5-0106,-15.31699077518252,4.014967631670285,-6.679669598506829,6.339491055290287
llama-2-70b-chat,0.9545197285318918,2.7599225683183426,-4.654015019344119,4.446616847420904
vicuna-33b,-7.203835366841231,3.3985360479174256,-5.498309820177617,5.211381127885424
starling-lm-7b-alpha,-7.309679212790645,4.632997940657201,-7.849490324377182,7.447460818412189
gemma-1.1-7b-it,-7.7352057327043475,3.373663295657972,-5.3679751674267,5.447491643680447
nous-hermes-2-mixtral-8x7b-dpo,-8.995647017687201,6.779826229887229,-11.401442773807695,11.433954069199991
llama2-70b-steerlm-chat,-13.59606217996723,7.819841411774045,-12.649301064156194,12.8585997963951
openchat-3.5,-14.450899395698876,5.026687732829702,-8.679592306807045,7.847849241093115
deepseek-llm-67b-chat,-14.582628406747205,6.498052138798972,-10.75578554349087,10.349209272543924
openhermes-2.5-mistral-7b,-5.972001289659565,5.977132238742002,-9.92053717170282,9.421489001015324
qwen1.5-7b-chat,-21.67773519280868,6.293844735927583,-10.064588610440337,10.082603352825537
pplx-70b-online,-11.683898882536699,5.626107400210292,-9.393459069287008,9.173017483530916
mistral-7b-instruct-v0.2,6.887615774360395,3.4311124081784703,-5.687237743995317,5.741311770720235
gpt-3.5-turbo-1106,-18.426571993424776,3.7965452784370095,-5.934180552513444,6.182369603736149
phi-3-mini-4k-instruct,3.340713881861284,3.804914448307796,-6.101239429024854,6.126926189493092
llama-2-13b-chat,-6.258607908439128,3.6499127599428562,-6.25428772562524,5.832348521876369
solar-10.7b-instruct-v1.0,-4.687663003457206,6.741817357236169,-11.147781273987524,10.425066350359849
dolphin-2.2.1-mistral-7b,-5.424317261859006,10.676262103329094,-17.200354658359366,16.59786257552556
wizardlm-13b,-11.840839673248542,5.383271532641857,-8.745545357567103,8.852829810214011
zephyr-7b-beta,-2.385059053801906,4.399121294500913,-7.107947054179707,7.581329000851421
phi-3-mini-128k-instruct,-15.04142813078439,3.419834604763429,-5.447200548437829,5.6799135581159135
vicuna-13b,-18.55704022697939,3.7170740036563377,-6.2459528174709735,6.185899375596573
mpt-30b-chat,-5.205169193906693,8.698730076799274,-15.059612540939902,14.028535611267808
codellama-34b-instruct,-12.545412124831163,5.218761723123669,-8.543706159976491,8.492916046669844
zephyr-7b-alpha,-6.299604669316094,9.975861721246135,-16.36196738685773,16.128925660592003
codellama-70b-instruct,-3.3404426745870115,13.263256486033502,-21.98086161096508,22.816697225927584
pplx-7b-online,-6.194482932292861,6.041166457825659,-9.625700491718959,10.195765795116394
gemma-7b-it,-7.912859669729636,4.893229712681016,-8.028630734072228,8.281622058238524
llama-2-7b-chat,0.5336282878683515,3.999226159313417,-6.453476586416636,6.324755219516841
qwen-14b-chat,-21.410760375397313,6.383103894814916,-10.558838952288161,10.343163520437304
falcon-180b-chat,-11.648456616755128,12.642711584159667,-21.23850706723842,20.623397951499626
guanaco-33b,-7.025871769239286,8.041639566398404,-13.74551089210822,13.010106796371945
gemma-1.1-2b-it,-14.65270497661707,4.880545637826014,-8.031968909213267,8.041811360563692
stripedhyena-nous-7b,-8.243490267547935,6.047482911337896,-9.907041796648558,9.87504639363503
olmo-7b-instruct,0.3109708797473535,5.955741915270669,-9.661524637930167,9.932298341743868
mistral-7b-instruct,-3.772513227113473,5.011974434298944,-8.413178626308138,8.104473116155049
palm-2,-1.0977318784854415,5.306840663635579,-8.15337521372718,8.700835230483264
vicuna-7b,-20.15560340351819,5.71480304615885,-9.206936363443713,9.38134862913876
qwen1.5-4b-chat,-30.190381530048896,5.538129131962231,-8.997884020448598,9.162754992332161
gemma-2b-it,-11.47546665102681,6.835218562717903,-11.207897885263856,10.676142979926869
koala-13b,-6.146985215050621,5.88284458094275,-9.453732393568696,9.869325047523944
chatglm3-6b,-10.530413031288685,7.023458712450038,-11.246591211474998,11.622539954599251
gpt4all-13b-snoozy,-3.95823015716804,10.841829949658779,-17.016487712793314,18.21323489379928
chatglm2-6b,-7.2793422185943095,9.203159474659714,-15.008071699324711,14.934267652326877
mpt-7b-chat,-15.20074371780374,7.2317416911270485,-11.920899284786596,11.439772329360288
RWKV-4-Raven-14B,-19.45857212572829,6.958847561012714,-11.506425867488527,11.392289504660823
alpaca-13b,-13.058939920211795,6.163376463057585,-10.072878709018108,10.319405002461746
oasst-pythia-12b,-9.406151512604684,6.589220843646118,-10.907125238521738,11.507581525802724
chatglm-6b,-25.01054242844269,6.987078535440326,-11.605601075322664,10.78018161581111
fastchat-t5-3b,-1.4314460441573755,7.301826636422237,-12.03678886296328,11.694855469061293
stablelm-tuned-alpha-7b,-13.224166716991611,8.32819145877737,-14.138102467639076,13.259305562525427
dolly-v2-12b,-20.217967652601097,8.494181914818913,-13.519124976779892,14.014944067590854
llama-13b,-23.19749684157547,10.30088183191176,-16.654524953428833,16.560288930864857
