,win_rate,standard_error,n_wins,n_wins_base,n_draws,n_total,discrete_win_rate,mode,avg_length,length_controlled_winrate
gpt-4-turbo-2024-04-09,46.11526538763708,1.474073957743638,370,426,9,805,46.52173913043478,minimal,1802,55.01530093647852
gpt4_1106_preview_verbose,64.30360147101865,1.3348590089025316,525,268,12,805,65.96273291925466,dev,2402,51.57500797967598
gpt4_1106_preview,50.0,0.0,0,0,805,805,50.0,minimal,2049,50.0
Nanbeige-Plus-Chat-v0.1,56.70300973017392,1.482841874951873,456,347,2,805,56.77018633540373,community,2587,44.45966240337981
gpt4_1106_preview_concise,22.92019444047205,1.232517714329424,172,622,11,805,22.049689440993788,dev,1136,41.896601591245386
aligner-2b_claude-3-opus-20240229,34.46337362321739,1.314666526302454,225,475,105,805,34.47204968944099,community,1669,41.823071715247664
claude-3-opus-20240229,29.10526953334248,1.3941539442369442,223,579,3,805,27.888198757763977,minimal,1388,40.5095080124761
gpt4,23.576789314782605,1.275704201206918,179,618,8,805,22.732919254658384,verified,1365,38.12808974440021
aligner-2b_qwen1.5-72b-chat,31.773037737123104,1.2392772646245978,180,473,152,805,31.801242236024844,community,1812,36.725868878524274
Qwen1.5-72B-Chat,26.49828339562733,1.304236164893057,201,600,4,805,25.217391304347824,community,1549,36.571754111987296
gpt4_0314,22.073258928708075,1.2466725494608204,172,627,6,805,21.73913043478261,verified,1371,35.30706121640206
Ein-70B-v0.1,24.84472049689441,1.521406431103307,199,604,2,805,24.84472049689441,community,1467,35.029054008520646
claude-3-sonnet-20240229,25.556325292273296,1.3419811051815638,193,608,4,805,24.22360248447205,minimal,1420,34.87247436243302
Meta-Llama-3-70B-Instruct,33.17785695886864,1.3886514096065603,266,537,2,805,33.16770186335404,minimal,1919,34.42459717459881
gpt4_0613_verbose,23.237360043453418,1.283539505582624,171,630,4,805,21.490683229813666,dev,1473,33.82126688658535
mistral-large-2402,21.43877598137888,1.2485232545097724,166,638,1,805,20.6832298136646,verified,1362,32.65207998531868
Samba-CoE-v0.2-best-of-16,26.988254318335404,1.3189030000371738,201,601,3,805,25.15527950310559,community,1578,31.506544268148147
Mixtral-8x22B-Instruct-v0.1,22.21017054750302,1.2780740057417268,174,628,3,805,21.801242236024844,verified,1445,30.878810294279383
claude-2.1_verbose,24.35407109006212,1.293586209982439,191,613,1,805,23.7888198757764,dev,1414,30.29117916664986
gpt4_0613,15.75503808763975,1.0754642482396215,117,684,4,805,14.782608695652174,verified,1140,30.18332231673423
Snorkel-Mistral-PairRM-DPO-best-of-16,34.8601328912795,1.3599450436840308,270,533,2,805,33.66459627329193,community,2616,29.974321613074405
Contextual-KTO-Mistral-PairRM,33.227355200024846,1.3779687477923963,260,544,1,805,32.36024844720497,verified,2521,29.705808939683976
pairrm-Yi-34B-Chat,31.24128294680746,1.34824373994879,239,563,3,805,29.87577639751553,community,2195,28.81484086684313
mistral-medium,21.855772543652176,1.2682402187223842,164,639,2,805,20.496894409937887,verified,1500,28.614337401726104
claude-2,17.188240356708075,1.17482825615589,131,673,1,805,16.335403726708076,verified,1069,28.155196141629148
Samba-CoE-v0.2,21.847378669267083,1.2171089783436106,159,645,1,805,19.81366459627329,community,1469,27.62426735006872
claude,16.98534361236025,1.1687959793014906,129,676,0,805,16.024844720496894,verified,1082,27.289504443727107
Yi-34B-Chat,29.65994671879504,1.3225712597906096,219,582,4,805,27.45341614906832,verified,2123,27.19054787762733
Snorkel-Mistral-PairRM-DPO,30.220052700671644,1.3328273012530358,231,572,1,804,28.79353233830846,community,2736,26.39144645733206
claude-instant-1.2,16.12739962159006,1.1341036838301686,120,682,3,805,15.093167701863356,community,1112,25.61225902543337
dbrx-instruct,18.44834898407453,1.255388020324377,150,655,0,805,18.633540372670808,verified,1450,25.37544974044448
claude-2.1,15.733506736409938,1.120315865445773,115,688,2,805,14.409937888198757,verified,1096,25.251943886133027
Nanbeige2-8B-Chat,39.35450207219922,1.4524224245579649,323,480,2,805,40.24844720496895,community,2709,25.24207090175315
xwinlm-70b-v0.1,21.812957073875776,1.230327447605842,166,635,4,805,20.869565217391305,community,1775,24.649686057119272
gemini-pro,18.177644540571432,1.158850379070738,135,665,5,805,17.080745341614907,minimal,1456,24.38177610802152
Mixtral-8x7B-Instruct-v0.1,18.25531762637268,1.1885585968848205,135,668,2,805,16.8944099378882,minimal,1465,23.68848260134481
evo-v2-7b,20.834113022583853,1.2159901798146158,158,644,3,805,19.81366459627329,community,1754,23.35770570204821
Mixtral-8x7B-Instruct-v0.1_verbose,24.61406305018634,1.2975757385881228,194,609,2,805,24.22360248447205,dev,2083,23.223120780856064
Mixtral-8x7B-Instruct-v0.1_concise,13.744040154795034,1.071868299237546,105,700,0,805,13.043478260869565,dev,910,22.962609472758643
Meta-Llama-3-8B-Instruct,22.56990260938061,1.257580233106669,176,626,3,805,22.049689440993788,minimal,1899,22.918784673210016
Samba-CoE-v0.1,16.835501870062114,1.1180386124646702,124,680,1,805,15.46583850931677,community,1316,22.865837334795227
gpt-3.5-turbo-16k-0613,14.13239070746584,1.027579400264853,96,704,5,805,12.236024844720497,verified,1328,22.720189163383225
gpt-3.5-turbo-0613,14.09579857390062,1.0371186215049395,99,700,6,805,12.670807453416147,community,1331,22.35251298054288
gpt-3.5-turbo-1106_verbose,12.76316981026087,1.044246819212278,94,709,2,805,11.801242236024844,dev,1058,22.00093702171442
gpt4_0613_concise,9.400320574596272,0.901021275896262,71,729,5,805,9.130434782608695,dev,627,21.57799091454269
pairrm-tulu-2-70b,18.638962967441,1.1924966700012911,140,665,0,805,17.391304347826086,community,1607,21.428403975507223
tulu-2-dpo-70b,15.982854374136648,1.1457861368237434,119,683,3,805,14.96894409937888,verified,1418,21.238610038371124
Mistral-7B-ReMax-v0.1,15.999331369031056,1.1288683901451453,120,683,2,805,15.031055900621118,community,1478,20.55136770233589
gpt-3.5-turbo-1106,9.177964561962735,0.8904117511864436,64,737,4,805,8.198757763975156,verified,796,19.30058903498905
LMCocktail-10.7B-v1,13.153430917391304,1.045719535661201,104,700,1,805,12.981366459627331,community,1203,18.950710386651053
internlm2-chat-20b-ppo,21.74915450048448,1.2443662409548863,170,632,3,805,21.30434782608696,community,2373,18.748739485433603
claude-2.1_concise,9.22712524063354,0.8921752289142333,72,730,3,805,9.130434782608695,dev,573,18.2084579084493
gpt-3.5-turbo-0301,9.622453295105588,0.9129656686751644,71,733,1,805,8.881987577639752,verified,827,18.09324155198033
xwinlm-13b-v0.1,17.42793475019876,1.1450161466942668,129,672,4,805,16.273291925465838,community,1894,17.918937898189796
deepseek-llm-67b-chat,12.093422264919258,1.017384363293138,90,713,2,805,11.304347826086955,community,1151,17.843384089909343
gpt35_turbo_instruct,8.462446504415423,0.8724086933609648,66,735,3,804,8.395522388059701,community,1018,17.72780108286588
wizardlm-70b,14.383896086782608,1.0395048912985754,106,697,2,805,13.291925465838508,community,1545,17.575060737493747
vicuna-33b-v1.3,12.705947921540371,0.999255784310268,90,711,4,805,11.428571428571429,verified,1479,17.574575310874923
pairrm-tulu-2-13b,13.831901016757762,1.0835284665170843,110,694,1,805,13.72670807453416,community,1454,17.40520369795085
Conifer-7B-DPO,11.31358564916222,0.9870897936343656,87,717,1,805,10.869565217391305,community,1253,17.11249588276248
Mistral-7B-Instruct-v0.2,14.722772657714286,1.0785266446729775,113,691,1,805,14.09937888198758,minimal,1676,17.111251846021165
evo-7b,15.577437399527952,1.0835570388658722,112,689,4,805,14.161490683229813,community,1774,16.489386004239325
humpback-llama2-70b,10.121771502645965,0.9401806122130112,77,727,1,805,9.627329192546584,community,1107,16.249164231428974
OpenHermes-2.5-Mistral-7B,10.340415705751552,0.935655389929366,75,727,3,805,9.503105590062113,verified,1107,16.248577696674843
deita-7b-v1.0,12.646639472385097,1.0352555320811423,96,708,1,805,11.987577639751551,community,1417,16.05901353966741
jina-chat,7.786130393366459,0.8398450575524877,59,743,3,805,7.515527950310559,community,676,15.866004049505932
TempNet-LLaMA2-Chat-70B-v0.1,15.051894420220444,1.08015075807378,111,691,2,804,13.930348258706468,community,1830,15.831162778430024
gpt-3.5-turbo-1106_concise,7.41586497762733,0.8374438113826953,57,744,4,805,7.329192546583851,dev,431,15.769520983894386
causallm-14b,11.146160869950313,0.9544127300795228,81,720,4,805,10.31055900621118,community,1391,15.72032518895564
pairrm-zephyr-7b-beta,12.84127825562733,1.0535874941903722,98,706,1,805,12.236024844720497,community,1487,15.529867294986612
mistral-orpo-beta,12.565408794559003,0.9929774686147969,95,707,3,805,11.987577639751551,community,1636,14.716749430705242
Starling-LM-7B-alpha,14.24592352162733,1.0685460609395083,102,702,1,805,12.732919254658384,community,1895,14.690471079424972
llama-2-70b-chat-hf,13.88825834374378,1.079984772728814,104,700,0,804,12.935323383084576,verified,1790,14.689648588392544
openchat-v3.1-13b,11.082230489416148,0.9501308701291292,80,720,5,805,10.248447204968944,community,1484,14.50338795683784
wizardlm-13b-v1.2,12.027480342770186,0.971761817748135,82,720,3,805,10.372670807453416,community,1635,14.462590694316631
ultralm-13b-v2.0-best-of-16,13.853373471242236,1.049344706038026,98,705,2,805,12.298136645962732,community,1720,14.198987566645036
wizardlm-13b-v1.1,11.233909572857142,0.95027112458742,79,723,3,805,10.0,community,1525,13.91572059284851
zephyr-7b-beta,10.992885755354038,0.9617876718039866,78,725,2,805,9.813664596273291,community,1444,13.203198493136666
dolphin-2.2.1-mistral-7b,9.039799728223604,0.8892901246776709,68,734,3,805,8.633540372670808,community,1130,13.121477650433736
humpback-llama-65b,9.425139047801242,0.9300866722901956,70,734,1,805,8.75776397515528,community,1232,12.799859995893623
openbuddy-llama2-70b-v10.1,8.096422096285714,0.8498371493561294,57,744,4,805,7.329192546583851,community,1077,12.572173272324846
openbuddy-llama-65b-v8,8.77065015089441,0.8871992619444647,64,738,3,805,8.136645962732919,community,1162,12.469356289070015
Qwen-14B-Chat,7.502333484720497,0.8147265702205473,57,742,6,805,7.453416149068323,community,1013,12.378741790737235
gpt4_gamed,3.7383373713788814,0.6278799633668313,32,771,2,805,4.099378881987578,community,68,12.188764057640531
cut-13b,10.779089202496897,0.9428953578911924,83,721,1,805,10.372670807453416,community,1637,12.154781753927743
openchat-v2-w-13b,9.615344158447204,0.8908241710735803,67,736,2,805,8.4472049689441,community,1566,12.03042777097436
tulu-2-dpo-13b,10.119788388347828,0.929813366016608,75,728,2,805,9.440993788819876,community,1614,11.554479428088396
claude2-alpaca-13b,7.437351324770187,0.82494288683272,59,746,0,805,7.329192546583851,community,1127,11.498898213160734
minotaur-13b,5.738963669079602,0.7271241247374951,42,758,4,804,5.472636815920398,community,881,11.46525131683203
airoboros-65b,9.38895014967702,0.8816208133371506,67,735,3,805,8.509316770186336,community,1512,11.007642406363166
cohere,12.901455209677016,1.0141034031248928,96,709,0,805,11.925465838509316,verified,1983,10.893020886573929
vicuna-13b-v1.3,7.137240386509318,0.7846846271543213,50,751,4,805,6.459627329192546,verified,1132,10.843164943694475
xwinlm-7b-v0.1,11.245651737801245,0.9455447881121428,77,727,1,805,9.627329192546584,community,1894,10.812205627329451
airoboros-33b,9.053160396124223,0.860779211583604,64,740,1,805,8.012422360248447,community,1514,10.719002678100868
platolm-7b,6.320828058468243,0.7405704765061735,42,759,2,803,5.354919053549191,community,1344,10.543402072797148
vicuna-13b-v1.5,6.722122014857143,0.7674173990955542,48,753,4,805,6.211180124223603,community,1061,10.484438298504218
gemma-7b-it,6.937294379677018,0.7869665731853178,50,754,1,805,6.273291925465839,verified,1115,10.425760403690134
openchat-v2-13b,8.435075644708077,0.8235980231340756,56,746,3,805,7.142857142857142,community,1564,10.399607338483346
zephyr-7b-alpha,8.352663968198758,0.8664491644524538,59,745,1,805,7.391304347826087,community,1302,10.289760888704258
openbuddy-llama-30b-v7.1,6.130014613975155,0.7645283386371114,47,755,3,805,6.024844720496894,community,968,10.214494991204496
ultralm-13b-best-of-16,11.307314947751552,0.9418434058669024,80,723,2,805,10.062111801242237,community,1980,9.87608881694948
oasst-sft-llama-33b,4.770390991565217,0.6385940188591437,36,764,5,805,4.782608695652174,verified,748,9.866412143759783
wizardlm-13b,5.878152589354039,0.7044202269956406,42,759,4,805,5.46583850931677,verified,985,9.82815076877079
nous-hermes-13b,5.411878933180125,0.7081240036480951,43,761,1,805,5.403726708074534,verified,844,9.717863417764642
vicuna-13b,5.831103184496894,0.7422829864105622,44,759,2,805,5.590062111801243,verified,1037,9.222060023704104
tulu-2-dpo-7b,8.19751538447205,0.8749615125369641,64,740,1,805,8.012422360248447,community,1663,9.200265611470332
openbuddy-llama2-13b-v11.1,6.174716489490684,0.7535443869648034,42,761,2,805,5.341614906832298,community,1057,9.159089775016035
ultralm-13b-v2.0,7.504622955739131,0.8150376948236479,51,754,0,805,6.3354037267080745,community,1399,9.129018444208118
text_davinci_001,2.764005231108344,0.5177668863975088,23,777,3,803,3.051058530510585,verified,296,9.025728852143091
openbuddy-falcon-40b-v9,5.955742846322981,0.7388621614393269,45,758,2,805,5.714285714285714,community,1089,8.988936477935635
openchat-13b,8.022386010881988,0.8368334957442762,58,746,1,805,7.267080745341616,community,1632,8.806053491170802
TempNet-LLaMA2-Chat-13B-v0.1,7.728405066035775,0.8268032187601844,56,749,0,805,6.956521739130435,community,1540,8.57835531105755
llama-2-13b-chat-hf,7.702309957875775,0.8286143393809762,60,744,1,805,7.515527950310559,verified,1513,8.436014548885215
guanaco-65b,6.858494513378882,0.8048449272409411,54,751,0,805,6.70807453416149,verified,1249,8.252916991586922
opencoderplus-15b,7.40622245099379,0.8024858020878345,52,750,3,805,6.645962732919254,community,1628,8.152410155715494
oasst-rlhf-llama-33b,6.296434785813666,0.7417944201185225,44,759,2,805,5.590062111801243,verified,1079,7.970921837335629
openchat8192-13b,7.472766807962733,0.8038094304604438,51,754,0,805,6.3354037267080745,community,1664,7.897061734563998
phi-2-dpo,7.757095701776398,0.8357079426108714,57,748,0,805,7.080745341614906,verified,1687,7.770894620325308
minichat-1.5-3b,6.553443052819875,0.7674159339313342,48,757,0,805,5.962732919254658,community,1545,7.701632821534051
vicuna-7b-v1.5,4.797493939167703,0.6655960676971918,35,767,3,805,4.53416149068323,community,1083,7.616892731870527
llama-2-chat-7b-evol70k-neft,7.602383512198759,0.8110538775960626,57,748,0,805,7.080745341614906,community,1612,7.533052655504213
recycled-wizardlm-7b-v2.0,7.337129370484472,0.8012012288083948,50,755,0,805,6.211180124223603,community,1583,7.521609955340597
vicuna-7b-v1.3,4.6425118574534165,0.6420919828309861,31,771,3,805,4.037267080745342,verified,1110,7.156460956443475
alpaca-farm-ppo-sim-gpt4-20k,3.450341987080745,0.5834901037598084,26,776,3,805,3.416149068322982,verified,511,7.121808101560879
ultralm-13b,5.074590380484472,0.6707048924298834,38,765,2,805,4.84472049689441,community,1087,7.108191361311167
baize-v2-13b,4.590545330645964,0.6497033226861672,32,770,3,805,4.161490683229814,community,930,7.012247205044542
recycled-wizardlm-7b-v1.0,6.632749960459629,0.7713329913775592,53,752,0,805,6.583850931677018,community,1494,6.9014773220018215
alpaca-7b_verbose,2.9331016025062344,0.5302092824422211,22,778,2,802,2.8678304239401498,dev,537,6.816306816367379
alpaca-farm-ppo-human,4.100426814981367,0.6304721406855217,32,770,3,805,4.161490683229814,verified,803,6.418603294911531
vicuna-7b,4.16261116226087,0.6135107768217068,28,775,2,805,3.602484472049689,verified,1044,6.277217738516609
alpaca-7b,2.591450540223603,0.4870855382635108,17,785,3,805,2.298136645962733,minimal,396,5.875487163278986
phi-2-sft,3.977567775217392,0.6098271417287373,28,777,0,805,3.4782608695652173,verified,1068,5.853787690603355
TempNet-LLaMA2-Chat-7B-v0.1,5.430143264670806,0.7210775889233014,39,765,1,805,4.906832298136646,community,1512,5.739613836715224
minichat-3b,3.0071507063602487,0.504124596172496,22,779,4,805,2.981366459627329,community,868,5.729332875896306
guanaco-33b,5.002493724956522,0.6697115752218856,37,768,0,805,4.596273291925466,verified,1311,5.690019090866207
falcon-40b-instruct,3.3429188224720505,0.5541127159067186,27,777,1,805,3.416149068322982,verified,662,5.6075325447394455
gemma-2b-it,3.4019714381366457,0.5389981250162534,23,782,0,805,2.857142857142857,verified,1041,5.437453620377121
llama-2-7b-chat-hf,4.961339547167702,0.6691754516864777,38,766,1,805,4.782608695652174,verified,1479,5.354821279508294
openbuddy-falcon-7b-v6,3.521174371975156,0.5655836442881659,27,778,0,805,3.354037267080745,community,1152,4.8261244822302976
alpaca-7b_concise,1.9911763835447769,0.4437510223659489,15,787,2,804,1.9900497512437807,dev,351,4.467251679930348
phi-2,2.350209543026152,0.4496590405673333,15,785,3,803,2.054794520547945,community,626,4.398682270855682
baize-v2-7b,3.404814977515528,0.5826293992489878,26,779,0,805,3.229813664596273,community,1127,4.382564905021367
chatglm2-6b,2.7621847964596284,0.5020758950625489,19,781,5,805,2.670807453416149,community,1027,4.35928292679035
pythia-12b-mix-sft,2.5780902809689445,0.5127326717340586,19,786,0,805,2.360248447204969,verified,913,4.221361861408184
falcon-7b-instruct,2.146617553167702,0.454225792894195,16,787,2,805,2.111801242236025,verified,478,4.036937566812824
oasst-sft-pythia-12b,1.790114083180124,0.3985580883049341,13,790,2,805,1.7391304347826086,verified,726,3.270102114456748
guanaco-13b,3.469596859739131,0.5518606725700214,22,780,3,805,2.919254658385093,verified,1774,3.003787329611614
guanaco-7b,2.880002266173913,0.5202924149314048,21,783,1,805,2.670807453416149,verified,1364,2.871116813131697
baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568
