model_a,mean,std,lower,upper
gpt-4o-2024-05-13,-18.696130124574022,2.672576853881543,-4.418311772616065,4.300996536347174
claude-3-5-sonnet-20240620,-23.270983396913085,4.14977774946143,-6.9352285274623675,6.8539853555292645
gemini-advanced-0514,-28.156657450108284,3.0458541820057534,-5.09926698024179,4.799575178524847
gemini-1.5-pro-api-0514,-25.14729642070119,2.9180543503999594,-4.880667183663949,4.76358107519712
gpt-4-turbo-2024-04-09,-11.754161703405584,2.34986180681051,-3.991842067638384,3.6598900746217886
gpt-4-1106-preview,-17.360738698212735,2.2386815750516793,-3.9450435234938066,3.4372343137609818
gemini-1.5-pro-api-0409-preview,-16.38011184367722,2.570428041736612,-4.0093594046767524,4.3322349005088405
claude-3-opus-20240229,-31.134670295559197,2.0185939560289072,-3.4383656246038576,3.357490571567091
gpt-4-0125-preview,-16.165620700761906,2.2829580462128876,-3.5965375289139736,3.746640452364648
yi-large-preview,-18.977349719909842,2.8460754880208383,-4.423038038965032,4.683871390511374
gemini-1.5-flash-api-0514,-24.613775608439955,3.0309826025467,-4.682651495221673,5.005400621513747
yi-large,-15.589450451268739,5.1040396164701995,-8.176139839973256,8.343507420845377
gemma-2-27b-it,-20.282818948837555,6.490684700187914,-10.977193082250881,10.137718287228424
bard-jan-24-gemini-pro,-25.257113767463792,4.3668524745545545,-6.969236452880956,6.986310662881728
glm-4-0520,-14.720795796497551,5.173263258435824,-8.696850029199581,8.224675893120107
nemotron-4-340b-instruct,-21.672900525862953,4.068919279120758,-6.435185448463972,6.393381494847048
llama-3-70b-instruct,21.81062096003999,2.089316147456667,-3.3985045883843625,3.2807018444420137
claude-3-sonnet-20240229,-23.311517694760493,2.106647248803783,-3.355290829643195,3.5133643418303215
reka-core-20240501,-20.385755695308127,2.491261202645846,-4.3004637012040945,4.02537060027133
command-r-plus,-25.348751732663395,2.2406776502766705,-3.6549005674872923,3.593338763725928
gpt-4-0314,-22.97988919802595,2.4955785738317435,-4.103892716805433,4.156568173850456
qwen-max-0428,-16.574974948493487,3.0119653109443654,-4.901247216673887,5.1412233671368135
qwen2-72b-instruct,-17.274947963272005,3.334481965381135,-5.285949604651236,5.52329853139171
claude-3-haiku-20240307,-22.551619660447766,2.259007935012493,-3.757713440968228,3.5608519568678467
gemma-2-9b-it,-23.94160046073393,6.282985161741446,-10.072889695375391,10.52542627686043
deepseek-coder-v2,-27.75439135631692,5.066872989080197,-8.13342155350475,8.29767574845
glm-4-0116,2.3406416755499877,5.620319865112067,-9.106888272612446,9.025153949526917
qwen1.5-110b-chat,-13.468869302675365,3.0090350339793415,-5.082503535094396,4.762142554626436
gpt-4-0613,-18.013071583905628,2.2431016408659628,-3.75544907002147,3.5460427247900164
reka-flash-preview-20240611,-21.502606702036132,4.600300087719806,-7.721814349324596,7.733281196654481
yi-1.5-34b-chat,4.794922259413686,3.6032372396826946,-5.7876810717088505,5.747660867621839
reka-flash-21b-20240226-online,-19.0199210842924,3.9004019028260783,-6.65650161851034,6.087380774534314
mistral-large-2402,-8.544444089247522,2.4720800205541678,-4.176786112381616,3.9863105692158207
llama-3-8b-instruct,10.50180898183498,2.2305074544528436,-3.744987827672791,3.630045219881625
qwen1.5-72b-chat,-15.581496914177501,2.581220658435056,-4.254337422588332,4.2888540302536295
claude-1,-20.834741863963764,3.5263028137287944,-5.759638547752564,5.825755869100174
command-r,-25.637835753935835,2.588669820551975,-4.296005038668554,4.201308468097867
reka-flash-21b-20240226,-20.233562278144557,3.1721481464514607,-5.148375795402869,5.080747145451422
mistral-medium,-8.525015840940329,2.843922597674048,-4.943522952871886,4.687095794105666
mixtral-8x22b-instruct-v0.1,-9.95308368264587,2.6105191969303525,-4.216613051340486,4.429021953180218
gemini-pro-dev-api,-26.258274185634953,3.7292939795442592,-5.839481147631371,6.266680656020789
claude-2.0,-18.500483453798015,4.210767241518842,-6.8229965171217515,6.928433595996378
qwen1.5-32b-chat,-20.711182741725242,3.313221966793422,-5.2227430019643,5.503553783102021
zephyr-orpo-141b-A35b-v0.1,-8.839276892007447,6.518329508026195,-10.583052270457726,11.440048449496803
mistral-next,-15.99014268237607,4.111544218661741,-6.878818439950637,6.4959506689198
phi-3-medium-4k-instruct,-7.04111007062254,4.226265234903271,-6.5543121119561985,6.957543117080719
gpt-3.5-turbo-0613,-21.085785292984617,2.780325831401962,-4.523643299078337,4.496701796415337
qwen1.5-14b-chat,-19.16202141415317,3.6261283111606977,-6.065771353299553,6.007019714690525
starling-lm-7b-beta,-12.152412009575556,3.9022767091268484,-6.298266132212966,6.446495541304746
claude-2.1,-22.151227083959327,2.8344851384585033,-4.516672704907364,4.610087657548036
yi-34b-chat,-8.858393438563839,3.882382352382147,-6.2773954497128095,6.486090801077408
gemini-pro,-15.62517784292338,5.713737418979788,-9.357065307343802,9.63814639518687
mixtral-8x7b-instruct-v0.1,0.0,0.0,0.0,0.0
gpt-3.5-turbo-0125,-23.94641632565989,2.3676275379783522,-3.952660583977398,3.9528443967489366
claude-instant-1,-15.543791063475572,3.6633854606465968,-5.786384762891867,6.078965305976618
wizardlm-70b,-11.163301008594026,5.089054100704179,-8.672403324135509,8.226869491167935
gpt-3.5-turbo-0314,-26.487512733189536,6.909347491987968,-11.76598690345952,11.24634065567643
dbrx-instruct-preview,-1.61687660991569,2.880289042946051,-4.667205541374314,4.655627322445852
phi-3-small-8k-instruct,1.6924362021404757,4.156627668053711,-6.7219327672128575,6.928428252497763
tulu-2-dpo-70b,-10.76450563863022,5.562953434454933,-9.289470588203095,8.521686583189107
snowflake-arctic-instruct,-19.69638466980846,2.8397370206585486,-4.640712338695426,4.708241034391094
openchat-3.5-0106,-15.316990775146902,4.014967631673593,-6.679669598495609,6.339491055196952
llama-2-70b-chat,0.9545197285315694,2.759922568319181,-4.654015019303344,4.446616847420809
vicuna-33b,-7.2038353668542765,3.3985360479103104,-5.49830982025776,5.21138112788543
starling-lm-7b-alpha,-7.309679212800926,4.632997940656997,-7.849490324394843,7.447460818382957
gemma-1.1-7b-it,-7.73520573269354,3.373663295661983,-5.367975167412584,5.447491643690844
nous-hermes-2-mixtral-8x7b-dpo,-8.99564701771288,6.779826229920671,-11.401442773817685,11.433954069174643
llama2-70b-steerlm-chat,-13.596062179943477,7.819841411743831,-12.64930106410468,12.85859979641641
openchat-3.5,-14.45089939572103,5.0266877328341,-8.67959230696194,7.847849241073792
deepseek-llm-67b-chat,-14.582628406755934,6.498052138804549,-10.755785543392687,10.349209272535195
openhermes-2.5-mistral-7b,-5.972001289709234,5.977132238731802,-9.920537171763142,9.421489000965678
qwen1.5-7b-chat,-21.67773519280716,6.293844735965429,-10.064588610439294,10.082603352826293
pplx-70b-online,-11.683898882538022,5.626107400215956,-9.393459069288422,9.173017483530481
mistral-7b-instruct-v0.2,6.887615774367136,3.4311124081800477,-5.6872377441975885,5.741311770737083
gpt-3.5-turbo-1106,-18.42657199342783,3.7965452784319975,-5.934180552516841,6.1823696037308515
phi-3-mini-4k-instruct,3.34071388189123,3.8049144482819415,-6.101239428994317,6.126926189398824
llama-2-13b-chat,-6.2586079084372335,3.6499127599328935,-6.254287725621833,5.832348521872842
solar-10.7b-instruct-v1.0,-4.6876630034714415,6.741817357232051,-11.147781274068802,10.425066350435095
dolphin-2.2.1-mistral-7b,-5.424317261884396,10.676262103383106,-17.200354661215357,16.59786257534359
wizardlm-13b,-11.840839673268565,5.383271532647276,-8.745545358038825,8.852829810193683
zephyr-7b-beta,-2.385059053790232,4.3991212944954,-7.107947054578534,7.581329001531871
phi-3-mini-128k-instruct,-15.041428130799218,3.4198346047652253,-5.447200548378159,5.679913558114524
vicuna-13b,-18.557040226984572,3.717074003659922,-6.245952817464561,6.185899374666274
mpt-30b-chat,-5.205169193945103,8.698730076781294,-15.059612540977142,14.028535611228136
codellama-34b-instruct,-12.54541212482794,5.218761723115515,-8.543706160043675,8.492916046690507
zephyr-7b-alpha,-6.299604669357937,9.975861721336635,-16.361967386905384,16.12892565876138
codellama-70b-instruct,-3.3404426741946995,13.263256486268002,-21.980861610673742,22.81669722632492
pplx-7b-online,-6.194482932288146,6.041166457815509,-9.625700492162316,10.195765795112298
gemma-7b-it,-7.91285966969409,4.89322971266071,-8.028630733988399,8.281622058314728
llama-2-7b-chat,0.5336282878878899,3.9992261592956524,-6.453476585760679,6.324755219531808
qwen-14b-chat,-21.410760375430474,6.383103894819864,-10.558838952393526,10.343163520413938
falcon-180b-chat,-11.648456616665626,12.642711584160175,-21.238507067162015,20.623397951606854
guanaco-33b,-7.0258717692642305,8.04163956639357,-13.745510892131026,13.010106796904527
gemma-1.1-2b-it,-14.652704976620019,4.880545637858369,-8.031968909251624,8.041811360568392
stripedhyena-nous-7b,-8.24349026757969,6.047482911317182,-9.907041796603817,9.875046393592479
olmo-7b-instruct,0.3109708797567762,5.955741915231274,-9.661524637922312,9.932298341792468
mistral-7b-instruct,-3.7725132270841604,5.011974434309948,-8.413178626407575,8.104473115810581
palm-2,-1.0977318784830175,5.306840663629898,-8.15337521372403,8.700835230373514
vicuna-7b,-20.15560340353052,5.714803046147226,-9.206936363317825,9.381348629126332
qwen1.5-4b-chat,-30.190381530036134,5.538129131957643,-8.997884020439258,9.162754992412715
gemma-2b-it,-11.475466651056164,6.835218562595853,-11.20789788538687,10.676142978430745
koala-13b,-6.14698521505711,5.8828445809171654,-9.453732393653874,9.869325047526434
chatglm3-6b,-10.530413031316963,7.023458712387946,-11.246591210937945,11.622539953862914
gpt4all-13b-snoozy,-3.958230157132021,10.841829949546378,-17.016487712752827,18.213234893742225
chatglm2-6b,-7.279342218630064,9.203159474648533,-15.008071699357078,14.934267651184491
mpt-7b-chat,-15.200743717809601,7.231741691119362,-11.920899284903989,11.439772327813792
RWKV-4-Raven-14B,-19.458572125703412,6.958847560976826,-11.506425867460383,11.392289504710046
alpaca-13b,-13.058939920199007,6.163376463044528,-10.072878709015779,10.31940500247604
oasst-pythia-12b,-9.40615151260178,6.589220843657875,-10.90712523970419,11.507581525816553
chatglm-6b,-25.010542428434906,6.987078535429183,-11.605601075372263,10.780181615418062
fastchat-t5-3b,-1.4314460441682606,7.301826636402354,-12.036788862964935,11.694855469051278
stablelm-tuned-alpha-7b,-13.22416671700799,8.328191458792483,-14.138102467269368,13.259305562401387
dolly-v2-12b,-20.217967652617798,8.494181914787605,-13.519124976570406,14.014944066974682
llama-13b,-23.197496841586002,10.300881831908976,-16.654524952770302,16.5602889309283
