,np.tanh(std_delta_len),instruction_difficulty,not_gamed_baseline.astype(float)
gpt4_1106_preview_verbose,-1.5835274219761557,0.8334888457867451,0.1923534665807022
gpt4_1106_preview,0.0000000000000000,0.0000000000000000,0.0000000000000000
gpt4_1106_preview_concise,-1.6424856171097677,0.6835779026468267,-0.3182055425894927
gpt4,-1.6932957655004517,0.9437877797808294,-0.5280788018374943
Qwen1.5-72B-Chat,-1.5274094823561033,1.0054356999126142,-0.6274683163756027
gpt4_0314,-1.6345679540667064,0.9488894918300894,-0.6979541187358145
claude-3-sonnet-20240229,-1.9130324691129728,1.0465239548163705,-0.7406263647309234
gpt4_0613_verbose,-1.5390947908531147,0.9341745245965688,-0.7870463854656012
mistral-large-2402,-1.9411420839744060,0.9823805369873128,-0.8716243812669692
Samba-CoE-v0.2-best-of-16,-0.8896653295215229,0.7246508910780414,-0.8789122374292174
claude-2.1_verbose,-1.2919165999800568,0.6411239328440405,-0.9268504368596158
gpt4_0613,-1.7083898799515389,0.8150202748700113,-0.9832327944321382
Snorkel-Mistral-PairRM-DPO-best-of-16,-0.9713345265812358,0.7572348007200104,-0.9786458417989874
Contextual-KTO-Mistral-PairRM,-0.9925613341335908,0.7127004888182814,-0.9813637802107577
pairrm-Yi-34B-Chat,-1.2420210365145066,0.9025442065871576,-1.1003504646944962
mistral-medium,-1.7911049898127276,1.0462557091295406,-1.1677342709976499
claude-2,-1.8739694447471489,0.9967331116898248,-1.1814326793306524
Samba-CoE-v0.2,-1.0764856591340746,0.7028142773127926,-1.1042286401799457
claude,-1.8570161728445185,0.9761539805888084,-1.2349318654177823
Yi-34B-Chat,-1.2082824487596682,0.7927364384888685,-1.1659911777306440
Snorkel-Mistral-PairRM-DPO,-0.8398702490132689,0.7001202425778436,-1.1803272561718563
claude-instant-1.2,-1.8745125521411767,1.0413876705302201,-1.3903580059479834
claude-2.1,-1.8104339545938817,0.8304070732099538,-1.3132371282938735
xwinlm-70b-v0.1,-1.4799753671173721,1.1319043669461764,-1.5158893232400279
gemini-pro,-1.6042881311507928,0.9568156005175720,-1.4409309534461108
Mixtral-8x7B-Instruct-v0.1,-1.7649063282225192,1.1490632542192407,-1.6076252821367818
evo-v2-7b,-1.2120624438804533,0.6925705496978156,-1.3762505005847001
Mixtral-8x7B-Instruct-v0.1_verbose,-1.2258381070540063,0.9504807226818436,-1.5271775094878235
Mixtral-8x7B-Instruct-v0.1_concise,-1.9899832749517885,1.1040604840388315,-1.6424508784976986
Samba-CoE-v0.1,-1.1909565195726599,0.8596944704879288,-1.5018159265303417
gpt-3.5-turbo-16k-0613,-1.7389424015096959,1.0004829021767505,-1.5978043558545931
gpt-3.5-turbo-0613,-1.6678819994481620,0.9858687365368152,-1.6188312140582617
gpt-3.5-turbo-1106_verbose,-1.9026510941639232,1.0983697154184686,-1.7227799086869275
gpt4_0613_concise,-2.0802330782743521,0.9108755772565552,-1.6338318897318398
pairrm-tulu-2-70b,-1.4667382639578730,1.0771575229056798,-1.7590021518877734
tulu-2-dpo-70b,-1.8331943134229896,1.1394138545027557,-1.8207621588773133
Mistral-7B-ReMax-v0.1,-1.3830363903239853,1.0474672118935917,-1.8162638804920599
gpt-3.5-turbo-1106,-2.0421297235342082,1.0101354535205684,-1.9018508475471767
LMCocktail-10.7B-v1,-1.9708176570567768,1.1761604020236345,-2.0733462954970747
internlm2-chat-20b-ppo,-1.7073550427153950,0.9978618634823327,-1.9432851857917184
claude-2.1_concise,-1.6773711627919807,0.8025811354379702,-1.8374818540583480
gpt-3.5-turbo-0301,-1.9406146538412483,1.0966954780273372,-2.0913971813576766
xwinlm-13b-v0.1,-1.3343474891126346,1.0279127968160426,-2.0482606430232377
deepseek-llm-67b-chat,-1.8249478774870369,1.0156675013083316,-2.0449915150106990
gpt35_turbo_instruct,-1.6276427997503753,0.4535373846585072,-1.6483799650245663
wizardlm-70b,-1.2824030535736828,0.9844520643065112,-2.0439172379898181
vicuna-33b-v1.3,-1.5130882092204874,0.9581865704826056,-2.0211413509514147
pairrm-tulu-2-13b,-1.7120626885014305,1.1509192281920777,-2.2130952906021748
Mistral-7B-Instruct-v0.2,-1.5200257112944473,1.1457181187145653,-2.2399408169803903
evo-7b,-1.4271013961218140,0.6489137682349716,-1.8690609912613640
humpback-llama2-70b,-1.9623944063934609,0.9893492562070384,-2.1812224751798488
OpenHermes-2.5-Mistral-7B,-1.5323319574684169,1.0358782051369773,-2.2260793286135967
deita-7b-v1.0,-1.6070850675887325,1.0877127460961236,-2.2978957076176778
jina-chat,-2.2037670991916318,0.9697951496274804,-2.2021603814711144
gpt-3.5-turbo-1106_concise,-2.1447476270691985,1.0843791219409102,-2.3265767613803403
causallm-14b,-1.5673842748587057,1.0388809414918474,-2.2857797641632418
pairrm-zephyr-7b-beta,-1.4540504323843473,1.1295701914607097,-2.4008463029740081
mistral-orpo-beta,-1.3081705080888339,1.0447843056309580,-2.4039423188036402
Starling-LM-7B-alpha,-1.3393882903514849,0.9790938098438714,-2.3374232603716885
llama-2-70b-chat-hf,-1.3263511461517810,0.9869491424767904,-2.3470779409477869
openchat-v3.1-13b,-1.7199394691358008,1.0744771241413955,-2.4610413873990220
wizardlm-13b-v1.2,-1.5877084745451990,1.0242767662708765,-2.4109964381916540
ultralm-13b-v2.0-best-of-16,-1.2092653887403653,0.9980510246824972,-2.4128574211480647
wizardlm-13b-v1.1,-1.4706613380100313,1.0483637376861501,-2.5016663286839078
zephyr-7b-beta,-1.2414691063648409,1.0472105841184596,-2.5872480742143988
dolphin-2.2.1-mistral-7b,-1.4975356523548369,0.9917391428645558,-2.5326942291693229
humpback-llama-65b,-1.7280938522994302,1.0841268356922471,-2.6829249296422808
openbuddy-llama2-70b-v10.1,-1.7415382282119567,1.1438137431685695,-2.7880029232331189
openbuddy-llama-65b-v8,-1.6359487577879577,1.0801072581163400,-2.7212749183803906
Qwen-14B-Chat,-1.9362606327202221,1.1516389891012428,-2.8246472505882005
gpt4_gamed,-2.9836837676521406,0.6193629220101485,-2.2518120065626768
cut-13b,-1.4473759062007705,1.0840118410757795,-2.7682539024904145
openchat-v2-w-13b,-1.4759301759775996,0.9277768025892892,-2.5921675065339023
tulu-2-dpo-13b,-1.7660266790958090,1.1596031494498773,-2.9526173867535563
claude2-alpaca-13b,-2.0017194014706066,1.2067228591999779,-3.0260622016978802
minotaur-13b,-2.0058522012013440,0.5962471456713685,-2.3101791647190040
airoboros-65b,-1.7110116018708246,1.0761789187803414,-2.9184049929291174
cohere,-1.5895068694092105,0.9008193161467072,-2.7059604900032421
vicuna-13b-v1.3,-1.8324967293629133,1.1145409697243569,-2.9955947660838271
xwinlm-7b-v0.1,-1.5827954654947400,0.9746885649059164,-2.8108445657514447
airoboros-33b,-1.7097680397615098,0.9920324059525856,-2.8466897726392060
platolm-7b,-1.7903676071400756,0.6747307203010036,-2.4933906128699448
vicuna-13b-v1.5,-1.7473535508849021,1.1135907865888532,-3.0486680858737603
gemma-7b-it,-1.5080757076915654,0.8463777565080071,-2.7012308600636188
openchat-v2-13b,-1.4653115768932845,0.9762368626758560,-2.8711344579953697
zephyr-7b-alpha,-1.4172601804991387,1.1041043657582297,-3.0651022145961231
openbuddy-llama-30b-v7.1,-2.1250950550625300,1.1717289275050979,-3.1760949040242106
ultralm-13b-best-of-16,-1.1271312291250568,0.8824484959900352,-2.8223571426275078
oasst-sft-llama-33b,-2.0304551531394721,0.9706049178865980,-2.9414053278635981
wizardlm-13b,-1.8367553752308368,1.0170053665302965,-3.0118560659008766
nous-hermes-13b,-1.9270004531054683,1.0742486180513686,-3.1115628180811754
vicuna-13b,-1.7952216139683690,1.0713293781241946,-3.1877513834498394
tulu-2-dpo-7b,-1.7350719210684249,1.0943568124050216,-3.2263527939441508
openbuddy-llama2-13b-v11.1,-2.0359277623302297,1.1468230501245966,-3.3146775987344195
ultralm-13b-v2.0,-1.6315136383948099,1.0432778683600861,-3.1609829033536090
text_davinci_001,-1.9704255453657336,0.4274629152954177,-2.4582030232368326
openbuddy-falcon-40b-v9,-1.9174321705286312,1.1135454233288340,-3.2920117947360961
openchat-13b,-1.4755398422838879,0.8819855231547405,-2.9807879926006722
llama-2-13b-chat-hf,-1.6914462237474519,1.0885638901071810,-3.3498492546756635
guanaco-65b,-1.8058894632052769,0.9651280060379998,-3.1905736489535386
opencoderplus-15b,-1.6300595575712578,0.9590470702208090,-3.1986709738825567
oasst-rlhf-llama-33b,-1.3949927449902535,0.8533722555804852,-3.0753060151793958
openchat8192-13b,-1.4382259937545825,0.9297392542278318,-3.1988553877303265
phi-2-dpo,-1.1355800930736402,0.9027529121423856,-3.1806364255526165
minichat-1.5-3b,-1.5428685270050095,0.9498104092756492,-3.2638033709117185
vicuna-7b-v1.5,-1.8610517599586256,1.0968125007177112,-3.5163474642647694
llama-2-chat-7b-evol70k-neft,-1.4412548405028620,0.9437820455051800,-3.2849750490576688
recycled-wizardlm-7b-v2.0,-1.4532078723647928,0.9650944108300016,-3.3202196632625789
vicuna-7b-v1.3,-1.9052182495947223,1.0035134997736708,-3.4510773290049475
alpaca-farm-ppo-sim-gpt4-20k,-2.1393042861815976,1.0662873576251355,-3.5629624273674967
ultralm-13b,-1.6643781723501994,1.0038350425020854,-3.4610614507524380
baize-v2-13b,-1.7883392680173575,1.1179541156210406,-3.6751245648095567
recycled-wizardlm-7b-v1.0,-1.4211548215176628,0.9632747634976396,-3.4354613391304500
alpaca-7b_verbose,-1.8936896179217435,0.5782373406372456,-2.9194172044997817
alpaca-farm-ppo-human,-1.9642708901057644,1.0484453638582003,-3.6796722791494880
vicuna-7b,-1.7563001929919926,0.9961877912570248,-3.6194254717758958
alpaca-7b,-2.2067864500930003,0.9859274836495522,-3.6907417919074890
phi-2-sft,-1.6670639149320623,1.0425073771850557,-3.7966483039481775
minichat-3b,-2.0333798597578747,0.9491501464196124,-3.6601747344400608
guanaco-33b,-1.4349560590211672,0.8503113217912687,-3.5047289878089538
falcon-40b-instruct,-1.8850338146041621,0.9649151938522180,-3.7157459259200158
gemma-2b-it,-1.7133755852713253,0.8907609790088235,-3.6281135660972095
llama-2-7b-chat-hf,-1.3973281545452172,0.9271777030231356,-3.7097288083680602
openbuddy-falcon-7b-v6,-1.7697934032776010,1.0116606248003970,-3.9977524268026290
alpaca-7b_concise,-2.0990109024239367,0.8953591959249391,-3.8823896757480600
phi-2,-1.5062178060108140,0.4609680776507991,-3.2866319258286127
baize-v2-7b,-1.6299951029568982,1.0759174615705518,-4.2507888631759521
chatglm2-6b,-2.0430718294666166,0.9423205689794754,-3.9972772301073038
pythia-12b-mix-sft,-1.8000276606167609,0.9719087610105596,-4.0935970860674580
falcon-7b-instruct,-1.9554984210635040,0.9197438703191234,-4.0502215536568924
oasst-sft-pythia-12b,-1.8114830706934373,0.8352440378605592,-4.1447992306227759
guanaco-13b,-1.3596147092204240,0.8881208851338480,-4.3434287450074027
guanaco-7b,-1.5767517948647565,0.8962503484379708,-4.4121467026415724
baichuan-13b-chat,-1.5434450958960080,0.8816095189753549,-4.7614489966908522
gpt-4-turbo-2024-04-09,-1.4534783741698640,0.7724941900359951,0.3493152634687561
dbrx-instruct,-1.3323734341570264,1.1044441013459276,-1.4568295217239740
Nanbeige-Plus-Chat-v0.1,-1.5000834241267793,0.6572030760647244,-0.1927630476629165
claude-3-opus-20240229,-1.8843310811133909,0.8954346955719651,-0.3896212886452050
Mixtral-8x22B-Instruct-v0.1,-1.8414212836636803,1.0995359505985356,-1.0235354211704557
Meta-Llama-3-8B-Instruct,-1.5260415796284870,1.0601549714712668,-1.6185347123220444
Meta-Llama-3-70B-Instruct,-1.2842806707166317,0.9032146562678472,-0.7442957089262047
Qwen1.5-14B-Chat,-1.2673261254168109,0.8917765927062211,-1.4423447102225453
Qwen1.5-7B-Chat,-1.4334024171563693,0.9571477402285772,-2.3083932198426003
Qwen1.5-1.8B-Chat,-1.6003884852505712,0.9646855557741587,-4.6744303356917447
Qwen1.5-110B-Chat,-1.4481674391207744,0.9102999775192785,-0.2004892206655888
