,win_rate,standard_error,n_wins,n_wins_base,n_draws,n_total,mode,avg_length,discrete_win_rate,length_controlled_winrate
gpt4_1106_preview,97.69900497512438,0.5104849118311993,783,16,5,804,minimal,2049,97.69900497512438,89.85849210429464
xwinlm-70b-v0.3,97.636815920398,0.5360407516038015,785,19,0,804,community,2113,97.636815920398,94.01522563893708
mistral-medium,96.83229813664596,0.6145319996386234,779,25,1,805,minimal,1500,96.83229813664596,91.54314285144824
xwinlm-70b-v0.1,95.56803995,0.724941926,765,35,1,801,community,1775,,
pairrm-tulu-2-70b,95.39800995024876,0.7394089832625841,767,37,0,804,community,1607,95.39800995024876,85.58824844769076
gpt4,95.27950310559004,0.716281440286153,761,32,12,805,minimal,1365,95.27950310559004,86.51018625518144
tulu-2-dpo-70b,95.03105590062113,0.7613100978662208,764,39,2,805,community,1418,95.03105590062113,84.25730016896037
Mixtral-8x7B-Instruct-v0.1,94.78260869565216,0.7793245403322182,762,41,2,805,minimal,1465,94.78260869565216,82.59666180688257
gpt4_0314,94.78260869565216,0.7489957601246771,756,35,14,805,verified,1371,94.78260869565216,85.334647371383
Mistral-7B-ReMax-v0.1,94.39601494396015,0.8121535187540114,758,45,0,803,community,1478,94.39601494396015,
Yi-34B-Chat,94.08468244084682,0.8260116588728516,754,46,3,803,verified,2123,94.08468244084682,76.35646640775717
gpt4_0613,93.78109452736318,0.8338571422122372,750,46,8,804,verified,1140,93.78109452736318,81.38159399734118
gpt-3.5-turbo-16k-0613,93.41614906832298,0.847714896903792,746,47,12,805,verified,1328,93.41614906832298,81.73910844041163
pairrm-zephyr-7b-beta,93.40796019900498,0.8756772802125495,751,53,0,804,community,1487,93.40796019900498,84.7091351498575
ultralm-13b-v2.0-best-of-16,92.79503105590062,0.9119066713591256,747,58,0,805,community,1720,92.79503105590062,76.29672881234201
Mistral-7B-Instruct-v0.2,92.77708592777088,0.9140909384741944,745,58,0,803,minimal,1676,92.77708592777088,82.98089782565651
llama-2-70b-chat-hf,92.66169154228857,0.911762258320568,743,57,4,804,minimal,1790,92.66169154228857,74.11120112901445
LMCocktail-10.7B-v1,92.21668742216688,0.9439649347776012,740,62,1,803,community,1203,92.21668742216688,84.7840193355363
xwinlm-13b-v0.1,91.76029963,0.968139439,734,65,2,801,community,1894,,
claude,91.5527950310559,0.9807635016419884,737,68,0,805,verified,1082,91.5527950310559,76.83227965166517
ultralm-13b-best-of-16,91.54228856,0.981927769,736,68,0,804,community,1980,,
claude-2,91.35572139303484,0.9897323784630048,734,69,1,804,minimal,1069,91.35572139303484,74.33550560445303
cut-13b,91.35572139303484,0.9897323784630048,734,69,1,804,community,1637,91.35572139303484,71.40952810665395
pairrm-tulu-2-13b,91.055900621118,1.0064549282620592,733,72,0,805,community,1454,91.055900621118,68.33213332478894
cohere,90.62111801242236,1.022512757790303,728,74,3,805,verified,1983,90.62111801242236,61.87530037843918
zephyr-7b-beta,90.5977584059776,1.0287080531312012,727,75,1,803,community,1444,90.5977584059776,76.29202319983864
deita-7b-v1.0,90.06211180124224,1.055091421627057,725,80,0,805,community,1417,90.06211180124224,71.13305243806445
openchat-v3.1-13b,89.49004975,1.076875475,718,83,3,804,community,1484,,
gpt-3.5-turbo-0301,89.36567164179104,1.0789487022114888,716,83,5,804,verified,827,89.36567164179104,79.17893267677465
evo-v2-7b,89.35242839352429,1.0802118075166327,715,83,5,803,community,1754,89.35242839352429,72.09602817675409
wizardlm-13b-v1.2,89.16562889,1.090425466,714,85,4,803,community,1635,,
vicuna-33b-v1.3,88.99253731,1.095692216,713,86,5,804,verified,1479,,
causallm-14b,88.26086956521739,1.1163330437039891,705,89,11,805,community,1391,88.26086956521739,69.99239868161098
tulu-2-dpo-13b,88.12189054726367,1.1366163670057372,707,94,3,804,community,1614,88.12189054726367,81.235850076993
humpback-llama2-70b,87.93532338,1.154547675,706,96,2,804,community,1822,,
xwinlm-7b-v0.1,87.82771536,1.154308696,703,97,1,801,community,1894,,
openbuddy-llama2-70b-v10.1,87.67123288,1.150841752,701,96,6,803,community,1077,,
openchat-v2-w-13b,87.12686567,1.176919744,699,102,3,804,community,1566,,
claude-2.1,87.0807453416149,1.1796402736033835,700,103,2,805,minimal,1096,87.0807453416149,65.9557674840558
openbuddy-llama-65b-v8,86.53366584,1.20291824,693,107,2,802,community,1162,,
wizardlm-13b-v1.1,86.31840796,1.206321783,692,108,4,804,community,1525,,
ultralm-13b-v2.0,86.28428927680798,1.2155106363016506,692,110,0,802,community,1399,86.28428927680798,63.77774668548318
gpt-3.5-turbo-1106,86.25621890547264,1.2070462114843,691,108,5,804,verified,796,86.25621890547264,75.55853548412969
zephyr-7b-alpha,85.7587064676617,1.2285451680042003,688,113,3,804,community,1302,85.7587064676617,73.46973908236046
openchat-v2-13b,84.9689441,1.257297984,683,120,2,805,community,1564,,
tulu-2-dpo-7b,84.22360248447205,1.2855613371106336,678,127,0,805,community,1663,84.22360248447205,77.85355333126851
humpback-llama-65b,83.70646766,1.307103474,672,130,2,804,community,1269,,
recycled-wizardlm-7b-v2.0,83.47826086956522,1.3097444061303425,672,133,0,805,community,1583,83.47826086956522,51.09808140925867
phi-2-dpo,82.33830845771143,1.3457343734381375,662,142,0,804,verified,1687,82.33830845771143,54.28867357876411
vicuna-13b-v1.3,82.11180124,1.348769958,660,143,2,805,verified,1132,,
platolm-7b,81.94271481942715,1.3525673702528052,656,143,4,803,community,1344,81.94271481942715,53.09897561500652
llama-2-chat-7b-evol70k-neft,81.86335403726707,1.3589241929038658,659,146,0,805,community,1612,81.86335403726707,45.84186320829894
gpt35_turbo_instruct,81.7103620474407,1.3306133328057392,642,134,25,801,community,1018,81.7103620474407,66.88517803643602
openbuddy-llama-30b-v7.1,81.54613466,1.370658001,654,148,0,802,community,968,,
llama-2-13b-chat-hf,81.09452736318407,1.3817573087734825,652,152,0,804,minimal,1513,81.09452736318407,49.81099211276289
openchat-13b,80.86956522,1.384373865,650,153,2,805,community,1632,,
openbuddy-falcon-40b-v9,80.69738481,1.390851798,647,154,2,803,community,1089,,
ultralm-13b,80.63511831,1.393955692,647,155,1,803,community,1087,,
gemini-pro,79.66417910447761,1.394240454839878,631,154,19,804,minimal,1315,79.66417910447761,57.96703555960053
openchat8192-13b,79.539801,1.422243989,639,164,1,804,community,1664,,
evo-7b,79.20298879202988,1.4222487749194896,632,163,8,803,community,1774,79.20298879202988,49.96597750089794
claude2-alpaca-13b,78.92768079800499,1.4409688114828627,633,169,0,802,community,1127,78.92768079800499,49.72428405745508
recycled-wizardlm-7b-v1.0,78.88198757763976,1.4394196218329596,635,170,0,805,community,1494,78.88198757763976,46.27776656706335
opencoderplus-15b,78.69565217,1.440029529,632,170,3,805,community,1628,,
minichat-1.5-3b,78.55361596009975,1.4475674207961344,629,171,2,802,community,1545,78.55361596009975,51.47924234116803
openbuddy-llama2-13b-v11.1,77.48756219,1.47127541,622,180,2,804,community,1057,,
vicuna-7b-v1.3,76.84144819,1.487520321,614,184,3,801,verified,1110,,
wizardlm-13b,75.31094527363184,1.5101858292160824,601,194,9,804,verified,985,75.31094527363184,62.55024525088112
jina-chat,74.12718204,1.541070307,592,205,5,802,community,676,,
airoboros-65b,73.91304348,1.528533306,587,202,16,805,community,1512,,
airoboros-33b,73.29192547,1.552903182,587,212,6,805,community,1514,,
guanaco-65b,71.80124223602485,1.586912361158523,578,227,0,805,verified,1249,71.80124223602485,54.69096685665386
llama-2-7b-chat-hf,71.36645962732919,1.593038654706019,574,230,1,805,minimal,1479,71.36645962732919,29.29429740470164
ghost-7b-alpha,70.44025157232704,1.6193866875778187,560,235,0,795,community,1681,70.44025157232704,
vicuna-13b,70.43478260869566,1.6069688407799696,566,237,2,805,minimal,1037,70.43478260869566,50.00294675412896
openbuddy-falcon-7b-v6,70.3611457,1.612538057,565,238,0,803,community,1152,,
phi-2-sft,68.53233830845771,1.6387859753495355,551,253,0,804,verified,1068,68.53233830845771,44.73886185749778
baize-v2-13b,66.95652174,1.656535823,538,265,2,805,community,930,,
oasst-rlhf-llama-33b,66.52173913043478,1.6608288428292477,534,268,3,805,verified,1079,66.52173913043478,55.80913636693129
minotaur-13b,66.02484472,1.664554533,529,271,5,805,community,881,,
guanaco-33b,65.96273292,1.671085371,531,274,0,805,verified,1311,,
nous-hermes-13b,65.46583851,1.669962276,524,275,6,805,verified,844,,
vicuna-7b,64.40993789,1.685110726,517,285,3,805,verified,1044,,
baize-v2-7b,63.85093168,1.694598186,514,291,0,805,community,1127,,
alpaca-7b-neft,61.64383561643836,1.7170195034517275,495,308,0,803,community,1067,61.64383561643836,31.61170102536985
oasst-sft-llama-33b,54.9689441,1.740266793,436,356,13,805,verified,748,,
guanaco-13b,52.60869565,1.75766903,422,380,3,805,verified,1774,,
text_davinci_003,50.0,0.0,0,0,805,805,minimal,307,,
minichat-3b,48.818407960199,1.758472313521605,390,409,5,804,community,868,48.818407960199,31.963518903280573
chatglm2-6b,47.12858926,1.759314322,375,421,5,801,community,1027,,
guanaco-7b,46.58385093,1.757046491,374,429,2,805,verified,1364,,
falcon-40b-instruct,45.71428571428572,1.7524717060805597,366,435,4,805,verified,662,45.71428571428572,39.14246411706998
falcon-7b-instruct,45.71428571428572,1.7524717060805597,366,435,4,805,verified,478,45.71428571428572,39.14246411706998
alpaca-farm-ppo-sim-gpt4-20k,44.09937888,1.739977258,350,445,10,805,verified,511,,
pythia-12b-mix-sft,41.86335404,1.737637146,336,467,2,805,verified,913,,
alpaca-farm-ppo-human,41.24223602484472,1.7271813123250834,328,469,8,805,minimal,803,41.24223602484472,29.78213586412439
phi-2,30.663329161451813,1.6056202651534168,234,543,22,799,community,626,30.663329161451813,29.81920417817079
alpaca-7b,26.459627329192543,1.535711469748,205,584,16,805,minimal,396,26.459627329192543,26.29495433067113
oasst-sft-pythia-12b,25.96273292,1.526107929,201,588,16,805,verified,726,,
baichuan-13b-chat,21.80124224,1.449524759,173,627,5,805,community,1727,,
text_davinci_001,15.17412935323383,1.235107892276849,112,672,20,804,minimal,296,15.17412935323383,20.57118821914347
