rm_key,run_dir,model_name,domain,RSI_IQR_med,RSI_IQR_iqr,nGMD_med,nGMD_iqr,nGap_med,nGap_iqr,SEI_med,SEI_iqr,RSI_IQR_tasknorm_med,RSI_IQR_tasknorm_iqr,nGMD_tasknorm_med,nGMD_tasknorm_iqr,nGap_tasknorm_med,nGap_tasknorm_iqr,SEI_tasknorm_med,SEI_tasknorm_iqr
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Brainstorming,1.3139364418418311,0.40581183495617434,0.9770296618823873,0.26967708845183336,0.30067660362209664,0.39044786999068715,0.12123819028216043,0.06859044983095802,1.2304029885326924,0.3602230635245831,0.875199763568144,0.26740333154992446,0.2833914333641904,0.37353114321948844,0.09985022330091764,0.07899914250846934
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Chat,1.0838578921107336,0.4378095140578502,0.6885990835954998,0.2403917346620169,0.09142194029050237,0.17014638887399053,0.08735101851126081,0.03884075362349554,1.42994532202979,0.32080068624754565,0.9229873550732355,0.30044585679543656,0.1469804425082885,0.23297314953595938,0.09683981948180176,0.0625089465721651
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Classification,1.0841118419448739,0.4279054705263792,0.7216266703578558,0.2103057334784526,0.12560993671163814,0.21598433393631183,0.10786398873805086,0.043521662224726365,1.155103579925084,0.33528731694773395,0.8691772115583847,0.3108875871214589,0.15476255635764508,0.2882695464947707,0.08142636887147842,0.04117136529903251
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Closed QA,1.0554155106870218,0.29102650992476586,0.7551057234920165,0.12147267066376932,0.1490685526403469,0.24379184077467297,0.09517903172647713,0.030850471015526293,1.3647437178114556,0.41906363565389726,0.976417138069521,0.1570746900111838,0.1927585568708673,0.31524397716530256,0.10403392417741775,0.03297891209098547
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Code,1.0762393970865252,0.45101490543314493,0.8229667625039481,0.2047117829653069,0.24607738928193554,0.3413085770845421,0.12870954625987086,0.11901032498398134,1.2973623281769204,0.665646406486824,0.9819712770922225,0.23551469511598855,0.34013953611364883,0.5389826897715619,0.11663272883508069,0.05477590619826256
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Generation,1.353044716299435,0.5710062020644293,0.9918575105313561,0.2406809553064545,0.19008145085400283,0.45872863164515615,0.11220801641763828,0.04322925990912105,1.258331874328499,0.503266674946973,0.9056091385082394,0.24664164260103738,0.17902706860575585,0.4249810543754615,0.09826868364981117,0.056100771420864004
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Open QA,1.2748281673842274,0.5327867520263165,0.9579552076736282,0.3542811811118751,0.2818843158957156,0.4316353587153146,0.10978725778079201,0.06365937167517421,1.1651560951729207,0.4661717191440846,0.90884436549048,0.28098836283583273,0.2577663592494454,0.41191473575748183,0.09730959259004501,0.06781896509309038
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Reasoning,1.2034682639908074,0.4386983384773413,0.8861438379084032,0.25509966255597505,0.26144135424742276,0.3781313030348834,0.11944216794122542,0.07516119064988591,1.3160187214718708,0.4647721475131479,0.9394233190462247,0.26964271660891703,0.29045874418458834,0.3797978609800914,0.10668686060631466,0.05303504274356849
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Rewrite,1.5077001652908681,0.8394946642092449,1.1016555429867319,0.3208202042679915,0.1423388820356294,0.06281687225429441,0.09818227687407871,0.053004352261560556,1.3195777567797158,0.7938841266288714,0.9794163098167927,0.3154281162808772,0.12641501654549142,0.05902531578524216,0.1030016732426085,0.05285466272554398
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Role Playing,1.4241506698587145,0.38549584822495153,0.9928874181920361,0.1476859368766945,0.3037240016317801,0.3690525964643683,0.09822941072379132,0.0846392511517966,1.3927165105619035,0.8160882075562896,0.9887887670745187,0.4414265160796267,0.29059472202233716,0.34059586981327206,0.12298073971040868,0.06525091303248542
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Summarization,1.3540605156359962,0.39333654935403306,0.8407996841902436,0.20004192768194962,0.13510131176263127,0.2936929831832389,0.08280610315083892,0.01867646395261391,1.3132501507080587,0.4938760350594227,0.915459952554399,0.17066215152697606,0.23641439537532327,0.28303648887325195,0.08434115329459069,0.050596793678236884
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Translation,2.0212502173810583,1.119156919056233,1.2800059223358653,0.4756198226965149,0.264361777340036,0.17649513472749762,0.08974679924498352,0.023906945216890407,1.3324291626460603,0.4477183711178294,0.8714416981496312,0.2903617550262615,0.167086446499442,0.17554216732849684,0.09510775050265852,0.10612359029768237
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Brainstorming,0.8537992978799107,0.5894314051446292,0.7879507245793684,0.39734140457794664,0.3693029253407974,0.6503407417893788,0.193537628107618,0.4042942465041509,0.8730246506103995,0.6269353037474392,0.801662443969847,0.4062885057016037,0.3776186724041978,0.6649847338691979,0.08248231494691005,0.1311237236228241
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Chat,0.94548387830306,0.5898588390859958,0.7639075653774936,0.413849741935808,0.3054015511064812,0.213984116896686,0.13444798222671606,0.10733381666145042,0.9615848699981171,0.5298765564763785,0.838177140151895,0.2499927594108945,0.29861106415968597,0.1911006409795823,0.0832235645124001,0.04634298820378485
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Classification,0.7328354924731448,0.48471008950979655,0.5707786628699895,0.2567245427420928,0.18913951905474868,0.09900438666905348,0.12403654811865916,0.06973910115873852,1.2515307436392018,1.1981489151969318,1.0577438965277761,0.6544685345005855,0.37502387164811346,0.2474711052061342,0.14351098785507665,0.10784421491682653
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Closed QA,0.764465604134278,0.506936654460863,0.721888434308143,0.3839187915025296,0.1846514626703987,0.16627180319163215,0.1434999475979697,0.06573731748094747,0.8751009962451146,0.49934469700946005,0.8263619509163943,0.39484606967223956,0.18738264186102196,0.190335078299888,0.08175791147840372,0.06427353668508262
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Code,0.8728201082707272,0.8183222807464776,0.8618789866882444,0.37439651314208344,0.3804162078163307,0.3816985096404306,0.14595797354641415,0.18335205562312473,0.9473582996501391,0.6528447145639958,0.8562111097331442,0.3209716291826359,0.39807076594270174,0.42582001059203356,0.09196125629324547,0.057282313804986384
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Generation,0.7997289042970277,0.5291632194119296,0.8057842184663888,0.418416272520331,0.28242697675802303,0.5949279384843911,0.1803522599915342,0.1638972667164731,0.7370729217088157,0.5937074904221156,0.791609801761292,0.43293283730691945,0.26564086033947854,0.5584845120974212,0.07025840044881904,0.074985219437225
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Open QA,0.8358470723425108,0.4601326378812134,0.792593132109212,0.32368622526494406,0.2613758551457148,0.4441038650799636,0.17822573223589844,0.16904236212638224,0.9003247191227767,0.5531754888810478,0.8865166096079872,0.4066308812608269,0.3082972784732077,0.4862203840795947,0.09746189748925893,0.0900139036009352
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Reasoning,1.1930482742183646,0.7710374009827905,1.0326484932108198,0.38441301199723477,0.385117981171364,0.42722022439598034,0.14424585874437768,0.21219042644124303,1.2313152017608997,0.8088683413344468,1.0576489095479407,0.44914135409920986,0.4213942677028305,0.4572816129110877,0.12961363223571482,0.07954720606365706
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Rewrite,1.5103378318191059,0.7471011002662572,1.1333351589511897,0.5799120950754425,0.35471674209166,0.29770774016188123,0.11504079062757794,0.06502807609313965,1.3098576158080286,0.5799997389378211,1.110494197647004,0.5473689713574406,0.30236250085653094,0.2455226065967048,0.13849319047856568,0.11737895909500598
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Role Playing,1.1273570203545753,0.6194052102829665,0.8660880236942022,0.18272207335172974,0.14789214371286563,0.20987006521103185,0.10171167179433915,0.07805066077427025,0.9754991714779697,0.6094067045580449,0.9575431417022758,0.3621928335477035,0.18378494808626572,0.27746362386752155,0.09923306361683815,0.06838215818470617
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Summarization,0.8932300789709854,0.6240535543953291,0.7048860619737802,0.3213947044126171,0.07488108360317239,0.1592191431590822,0.11128701667070895,0.10464992636276094,0.8291276850683617,0.6430604077594886,0.7566736093840987,0.24833081755567465,0.06950726466223883,0.17091687612587192,0.06461088523717962,0.034756262766817536
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Translation,1.4921718893110227,1.321625746705724,1.3894808848976818,0.8808463678363899,0.3419471530933309,0.1156208811396825,0.19932649474865816,0.09654948514829442,1.1093094224938729,0.7169288502779141,0.9005561174870929,0.3606444707493003,0.3935759355078601,0.2791899113068671,0.08899864118201306,0.01801480853963111
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Brainstorming,1.3738904101819145,0.23812093476766538,1.064377585829262,0.12998806590679224,0.6602356711228073,0.30310819447001613,0.1389696985026846,0.034972816318563416,1.3681075307416273,0.23459537414777865,1.0621322531077468,0.1294409296081147,0.6574566552275185,0.303402517517108,0.12376626815660585,0.04477530446163719
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Chat,1.3927971589458858,0.15538046305628783,1.0952076010198508,0.14822322077865668,0.733761916316029,0.29553048960826567,0.1372839371764848,0.013290712513841008,1.421409547884449,0.13738807107059925,1.135837079537248,0.16055818560808,0.7417236889504808,0.23939770527391002,0.15044831863534958,0.044839820505403805
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Classification,1.3975238461368789,0.07720255745288274,1.1523196538337692,0.08802464956532896,0.6654125189986566,0.12867092908813804,0.13403861585940335,0.013813617536432954,1.2729669305330003,0.09297647301534928,1.0565616105981195,0.087208847943405,0.5999603988628265,0.10733900624949955,0.12956462593144946,0.030378080135309543
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Closed QA,1.3894959805902718,0.2280814136606062,1.0997842346492248,0.15578894219625017,0.6551338500277675,0.37288310062276736,0.14031281032762744,0.028349419707397017,1.3696895685862438,0.2328176437316143,1.1302015239672187,0.15604994544908068,0.6499369291974033,0.3761444106375195,0.13450139981593023,0.05431934273554906
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Code,1.3299997434084097,0.27206351932322237,1.1089166611355257,0.11855064998784659,0.7058519538549286,0.21937830708671457,0.149992832406176,0.028919736918480377,1.3080295036912377,0.2641016417820865,1.0905984885251478,0.10105543836689423,0.6941919992512644,0.1815569696556183,0.13315234894616523,0.038119031050578456
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Generation,1.3087671644393466,0.28060016022719303,1.0937899074741395,0.11590022475189188,0.6882956871455267,0.2911039095405106,0.1467667398808284,0.026937964375936846,1.3070225292501363,0.28587407310041724,1.097501597655881,0.11125615790306087,0.6896787554090968,0.29356394582307066,0.1309821526614668,0.03759213324955957
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Open QA,1.3774916956607663,0.27587347303620025,1.067214275468962,0.11627483763662783,0.6635368494784214,0.2666451789966429,0.1400848464290474,0.042753504093934414,1.3497124365512363,0.2783048309768481,1.0750388380675904,0.12063407794080572,0.6638473074559997,0.28923200248373265,0.12431144415604223,0.031743967056904854
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Reasoning,1.3904713287407942,0.21507599012467482,1.1060995097199946,0.13236365345610146,0.6608358853692826,0.204654301352868,0.1427953986597844,0.029381399355754956,1.3822313032269737,0.2602222570171264,1.1314028129460199,0.12914805044039768,0.6712644354724994,0.18785204656081933,0.13541488502555166,0.04184601854902578
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Rewrite,1.3894959805902718,0.1695230112388617,1.1516868758803454,0.14715747577764549,0.5810824173688799,0.2913289898829388,0.13870206985517447,0.014119900822445436,1.411476707966841,0.20624039777160363,1.160770417982782,0.15739815125202594,0.5866268402845766,0.350527419458716,0.12922433392304,0.052064975517228024
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Role Playing,1.441714620033621,0.33011783556140384,1.166228785364936,0.12770495928521686,0.7567201112437085,0.08402999450653903,0.14196239704863156,0.010928893081372626,1.3785113272466276,0.2301636758755763,1.1090881147198588,0.11869950909842042,0.7661783556613252,0.07990309570793364,0.14075615860388524,0.02527334311175239
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Summarization,1.1668164951479432,0.2663450718734053,1.078939294047677,0.1426800963270506,0.6551338500277675,0.21547691448462525,0.1517125338353451,0.029127940688415288,1.3745336923249118,0.3074922558291915,1.2495590229017517,0.13196148103325878,0.7563443318029573,0.26349054710771835,0.15629171920438056,0.04153621932169205
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Translation,1.372689981688964,0.11869236724048648,1.0922315387144108,0.12253123752523476,0.690546490569809,0.212475843252249,0.1390477816001372,0.016470392986115745,1.3433498885411062,0.1788395663653517,1.1251826178002795,0.0921763345609874,0.7351477098441728,0.15028675070288666,0.135950763988254,0.0113388272837045
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Brainstorming,1.962154936659186,0.7846468260960582,1.1763292757030641,0.42040627747103554,0.0808958614236682,0.144149540302813,0.0783520576761465,0.028783071798654802,1.9338670949170322,0.9175785363296551,1.184531983705904,0.41714287066536415,0.08380982338506272,0.14934197252125536,0.1181536340518744,0.0508230615760516
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Chat,1.9212767088121196,0.47633892870213135,1.178954387045436,0.23103370352513064,0.17039766555198194,0.19707608793638315,0.07773691293795804,0.027599269519022085,1.6634085838514208,0.46799908933115764,1.0526978078574316,0.2098147568780817,0.1475273906394541,0.16835396142490647,0.10185270591069173,0.0310006974113492
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Classification,0.4762851415602273,1.3545484881402579,0.3551296044213891,0.7547105464082753,0.023020896734926855,0.06981571019143704,0.1062489667934588,0.06713924328279822,1.3297053475989618,2.3575042594974702,1.092184948298563,1.2035126024763194,0.05877992775302944,0.19546262143208998,0.1142224378619775,0.10398238052018721
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Closed QA,1.5714451378682779,0.7597971665363945,1.0860938747226703,0.44534558226720655,0.07228991871902264,0.0808958614236682,0.07982968107098543,0.025421692660209105,1.8189305748193265,0.6632581858231863,1.1808936462915252,0.38760708882958894,0.07784178194082253,0.09226647743656731,0.11396696996986244,0.026962049391394793
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Code,0.9804320226267446,1.075635263796886,0.6655531293968747,0.6915072929746653,0.08347764423506186,0.09380477548063654,0.09339564169746184,0.029710538712752865,1.6288576528536538,2.0685268776624883,1.1057281720504517,1.114462663239674,0.16579661126798986,0.24086047995489845,0.12507172480952478,0.15399938628498944
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Generation,1.4227236905036218,1.3900614485823968,0.905084454299205,0.7723272594337807,0.06669605596100303,0.13737236042290465,0.08787865741934542,0.05487586693615379,1.7047638262122224,1.606549543163561,1.0201618836466615,0.810653565116898,0.0758003336360499,0.15811665702758482,0.10265417084129425,0.1023484192724933
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Open QA,2.1678369673002145,0.8459641678666578,1.2907957841112256,0.36729595710262664,0.08519883277599097,0.13597389473339971,0.06835390142326003,0.020291326663594678,2.347015683368755,1.0930639896367103,1.4726245072948068,0.5489998236128579,0.08963082119231137,0.18112619847997524,0.15299084820629238,0.06345497955184043
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Reasoning,1.4733373910353187,1.5156947652847457,0.9281031099032193,0.660997657295058,0.08950180412831374,0.13242394336773344,0.08299944359189676,0.04012370336745236,1.5818696927350784,1.4386993296788915,1.0757202028988595,0.5556921056673855,0.10695818033472722,0.1603035468748335,0.11418547624665815,0.09326952060659538
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Rewrite,0.2645251638840427,0.16219512641161668,0.19396241005504958,0.07821033292710894,0.03184198800718854,0.05690679613446872,0.12665772927424235,0.044765389969605884,1.0703751532761305,0.7131658531748861,0.7848232372238155,0.6772664324207498,0.19855795391333478,0.24704672856407559,0.06848226536405683,0.13361819046319703
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Role Playing,1.4014777694515281,0.8013309251754162,0.9617462201205554,0.5136313470889287,0.08519883277599097,0.0903623983987783,0.07798714667762863,0.02590643648418045,2.5651138045356845,1.2308537238730668,1.5803477487676616,0.6768895409370137,0.15776629695967614,0.1387977065066977,0.14944680923931541,0.08102041809726584
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Summarization,1.8046661851641723,1.3722175642557335,1.239901195171808,0.6709169027145271,0.08347764423506186,0.1428586488971162,0.08122334193865122,0.024609367695336615,2.3456171187768162,1.1028986547992088,1.611563120590223,0.42189850642864135,0.10850017192243737,0.18456214811548624,0.1727701479389918,0.03928040805420796
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Translation,0.23983014235735273,0.34101047967158005,0.4506047694756009,0.38684758318474344,0.048193279146015094,0.22157613107367094,0.19474409403120907,0.22075580136729434,0.9638621894097716,1.851210141679216,1.108678573925587,1.4278764472520122,0.2202201691035571,0.4337734713148105,0.13498587869493306,0.2209676438067456
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Brainstorming,0.9645943119396468,0.4279027398830013,0.7776386892412357,0.22859977360238182,0.15955695385467844,0.2357091363762295,0.12711406463383645,0.07017055378204395,0.9857941869273315,0.4373071957046055,0.7947296494443399,0.2370211048693951,0.1630637000932428,0.2408895569559269,0.08434208463509307,0.039938842150105014
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Chat,0.522186394433493,0.4587262423322005,0.4704109687508512,0.2911108564646595,0.1015362433620681,0.06346015210129255,0.1528964139846053,0.09940625428290145,0.6922405163049267,0.5610689050176509,0.6457045460478663,0.2980504489936169,0.12507113420758056,0.1049851097172419,0.06291917348852016,0.05595298983159541
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Classification,0.8122899468965448,0.36262944057881463,0.6382278154187138,0.188869500301466,0.2538406084051702,0.29282327326739277,0.18721466937651865,0.16174185225485094,1.1627939655560051,0.500838131415327,0.913914043192844,0.2886093371663484,0.3631136332977693,0.42111760308467105,0.1283494807408666,0.07658974177819589
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Closed QA,0.7107537035344766,0.40977126785406054,0.5874596937376797,0.1398137954231653,0.1341728930141614,0.174062131477831,0.14445400289549803,0.15088826983533654,1.0142718187617972,0.49952887074018526,0.8226871418845688,0.2980832178472169,0.18764028647093248,0.27385339106568524,0.0884450093299688,0.07993266824307099
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Code,0.7760270028386632,0.6237226377955613,0.6740878378759521,0.322534019577316,0.1885673091009836,0.21757766434728876,0.15185972642406742,0.1329480307275578,0.9429773724721331,0.5290517090852278,0.7648986919637579,0.42473822079480417,0.2610931972167465,0.271972080434111,0.07317780091059367,0.06426805346231268
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Generation,0.9573417231280705,0.6514581239148314,0.8030982978818733,0.33251105093074085,0.1631832482604666,0.2734452625364624,0.16748258528532284,0.13965285723299548,0.9626780115887621,0.6689513686000905,0.8029780340773336,0.31836003710815364,0.16144725625769568,0.270107304242004,0.08972053458840767,0.08301490879241774
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Open QA,0.8159162413023329,0.522186394433493,0.7113580859354413,0.316696378105498,0.0870310657389155,0.17768842588361916,0.13050633897224417,0.07045347574775052,1.0299656192007467,0.6883988969402272,0.9347672237190651,0.4439675171636511,0.11849161990805052,0.2543510321975816,0.09997596996877878,0.09295786984536547
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Reasoning,0.8122899468965448,0.5221863944334931,0.7248559595569861,0.29942111447792386,0.2574669028109584,0.25837347641240543,0.1640068817045789,0.09486727685390225,0.7131910489547606,0.4598538861356398,0.6284189862883032,0.2632118795391446,0.2183802049125042,0.25777157098849385,0.057694115522117895,0.04968468207805593
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Rewrite,1.0425596416640919,0.322740202115145,0.7920431364642275,0.23288868517172756,0.25384060840517025,0.4531734790233374,0.13875013917674434,0.06061999860433909,1.4335290301200745,0.6958416956316824,1.1570623423013107,0.43108546593606045,0.30725226823496166,0.5949108987777281,0.1287004234963931,0.10250018224953594
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Role Playing,0.8558054797660025,0.8141030940994387,0.9384044078978435,0.6309752266071373,0.27559837483989913,0.26743921242687574,0.13436325875468003,0.11009228327301901,1.2151131239425685,1.2211200391033015,1.33216729061866,0.8332449458673967,0.3913076161848949,0.3896243597387555,0.2019224846219575,0.23271818174843
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Summarization,1.0226150224322572,0.8413003021428498,0.8118870252959016,0.3414760565450504,0.12329400979679697,0.21032507553571245,0.1300458195403782,0.09899911775296222,0.9704407865938769,0.7983768173396435,0.7704642178828455,0.32405380876213963,0.11700349909287877,0.20049797467846847,0.08681465277078915,0.06913066239793186
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Translation,1.3199711637068852,0.986352078374376,1.104408107362812,0.6963240737114443,0.2901035524630517,0.3263664965209332,0.1124236112409015,0.08431319821247596,1.3710959700835705,0.6649877442056128,0.9691756797691817,0.3361153889226507,0.22114451130380172,0.6125498199678915,0.159756476314302,0.1873795960909811
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Brainstorming,1.1367636946300665,0.9473716246672785,1.0958646953257607,1.0560281560592484,0.8990468293999007,1.3982316826405818,0.2764083107626212,0.4281278065584958,1.1244224723723215,0.9579886624591646,1.0839674911545523,1.0422129784172176,0.8892863692496001,1.383051845313965,0.24163440318882634,0.41174548279281087
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Chat,0.8406458002257077,0.7143844202505159,0.6548866581340843,0.5571974561644981,0.26403845584388663,0.9627944316675057,0.2303283635442811,0.29888730281088743,0.9221800082743549,0.8947850161361678,0.9600706012415173,1.0032827766494272,0.37556871834492234,0.9976466513601627,0.12558440338832955,0.39153488206643766
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Classification,0.5650916484883182,0.6136220811823659,0.5965998719747079,0.3009161010266516,0.25170021024370504,0.11721333320172539,0.15933411555662869,0.1169331980216333,0.8809413531566938,0.8903464541720016,0.9300606369187461,0.44133797890301074,0.3923843581745972,0.1790582952594607,0.09653217622482235,0.07964082976418785
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Closed QA,0.9253684200136214,0.5642690987816394,0.9156806123571825,0.45125990852516096,0.2681512043772805,0.5441166309680094,0.1485436293973581,0.12337836901805499,0.8956048101039087,0.4320085442928605,0.8872402733057626,0.43660218353325064,0.29172189872679183,0.5236022380600355,0.08640009443745467,0.08741141916818373
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Code,1.0314773321751833,0.7386496365975397,1.0623000975726742,0.4742684517092036,0.5938808882220753,0.8554516949459257,0.22991358171060716,0.28010926936828695,1.0986337217241475,0.7280483383928509,1.0807519603096685,0.37950573618800443,0.6041964371490096,0.7807742374716893,0.16172443383283608,0.17852093419453485
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Generation,0.9475772620939484,0.647757894009535,0.9043477052873861,0.5312757161026351,0.24388598803025668,0.7568485488578074,0.20235846052453432,0.19332798901506326,1.1166085734302862,0.8119795635933763,1.0354445426878063,0.5819605395296984,0.26805568247585976,0.8695579005409394,0.15576524777517142,0.1839015504291753
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Open QA,0.8867085837997191,0.48694942635383465,0.9770519599166044,0.6550465983548275,0.5494632040614215,1.5903998578634106,0.3302500469416684,0.5204275398523563,0.8913388734700568,0.6726615192511902,0.9920094463295597,0.6764589241083928,0.654628952256431,1.284638650450489,0.20595950158246934,0.3237792351729969
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Reasoning,1.46331592818154,0.7876427535015944,1.4825915808564534,0.720999090808391,0.6856979992300934,1.2620804276764943,0.19024716784588086,0.40557527407538185,1.17242733359351,0.5817506320534774,1.1941611832333452,0.5727355614915192,0.5781980644877112,1.028895215939003,0.21925041840019255,0.21149642321059348
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Rewrite,1.4382281621278374,0.6765471337432922,1.347336419539833,0.539972465605541,0.7176746190772308,0.7991455970559301,0.2662696710116419,0.3284640246683067,1.359424196946346,0.6830836377832173,1.2366753792147835,0.6189343643270746,0.7688192939083468,0.8396979360806669,0.1927681448055451,0.13019656047637415
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Role Playing,1.0150263380416078,0.611565706915669,0.9776460235936502,0.43414630490564976,0.38659836213902404,0.4717322567802773,0.22980328083987334,0.42094630610529765,1.308681756594526,0.7575285191402548,1.2554200823661152,0.5700907651353224,0.47886806186404796,0.6020854343325119,0.178330045290769,0.28838993445714967
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Summarization,0.7419398354242547,0.5741396952617848,0.6145360253008978,0.2583720023089885,0.19576683018954835,0.2911825961642862,0.11288449620576524,0.1537403622691197,0.8946921544821895,0.7082152974504251,0.7967260580330816,0.43483713913642263,0.2767737944059132,0.3511319541981099,0.08709647146194321,0.04921202910491629
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Translation,2.3014940792872114,1.5291199047158421,1.585595939090377,0.7869058860560277,0.4112748533393873,0.4046944556859571,0.11051207872902935,0.02699449482736027,1.3190484970829273,0.4094188787797439,0.9672800585853538,0.3903581698489207,0.2504130975797658,0.4282974555669119,0.14275349540918647,0.06310069022133558
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Brainstorming,1.4909969966540293,0.5432458528867672,1.0653131294286484,0.3442924255640103,0.3518465381599206,0.39348644096231444,0.12614805664466988,0.11130221227357034,1.3222440393329513,0.4834607164036925,0.9470224699872911,0.27105330716168163,0.3112646212393861,0.3561844278026336,0.11788303853154292,0.06159677376922584
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Chat,1.1822800341395183,0.8134677774896955,0.9558287436893996,0.23924881499743755,0.13317202304547535,0.5970514172404567,0.09717570540031573,0.09328019647364322,1.394273673094783,0.4910927551108114,0.9116851416584185,0.2407702293248557,0.15212502272307374,0.8660679073263439,0.1011801293323964,0.07549926201925261
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Classification,0.7767579824296635,0.2213701135922833,0.6061074359936249,0.06177281813930746,0.1315641221990456,0.1695389568956069,0.10665783928717432,0.043574210727694046,1.1841448212158676,0.3728172834698069,0.9765265902876546,0.1742707815467055,0.1979773058650561,0.28669160152525297,0.10127728333940439,0.040778418854257203
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Closed QA,0.9182059657141155,0.6811824644698249,0.6882472444960497,0.3976158823926673,0.13695531915472178,0.18765148701862433,0.12162830103186084,0.06851226269526234,1.089998439832606,0.962338198199288,0.8826729611178697,0.5879338232519405,0.16505509281099448,0.2606380649141393,0.09653304445004207,0.0913235335564303
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Code,1.12704391094452,0.38192374222843006,0.7815843122579175,0.22352449054338208,0.22397112966739033,0.22926774422033536,0.11987362037097327,0.066845753439573,1.3517670265144506,0.4452794699719502,0.9904304591511448,0.29717529729787895,0.3303905685593401,0.33309581037098723,0.1236202693228583,0.07874710514745376
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Generation,1.2660800429593273,0.5336812074105786,0.9717921514392478,0.31259007906524006,0.2631282443980912,0.38249123664481693,0.10640191039404917,0.08317273793421473,1.172699258907091,0.4950567314075238,0.9049392121599593,0.28575056518466435,0.2494126511498982,0.36390308648126424,0.09753994245475284,0.04202346179400157
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Open QA,1.4054945045850593,0.549523759868048,1.094102961904428,0.4557991669838788,0.4025427060238232,0.4782086282087523,0.12150224419838396,0.08348324066660184,1.2744271894850019,0.5913601690072032,0.9794146200057743,0.4170641887364913,0.4113332789908003,0.48169222858923666,0.12247428563142149,0.09326255640680514
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Reasoning,1.128840976596412,0.5960110108104141,0.8767664757980642,0.310087422120751,0.23910431410437616,0.360264372002994,0.12775179131461878,0.08371749600527637,1.2722056486240656,0.6298572924890402,0.9715861348124305,0.37134075922514187,0.2936985646218756,0.4191099221826716,0.1147140570878113,0.07257603426425482
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Rewrite,1.3523391942501466,0.9356091278166492,1.0420589948016066,0.37360515012086126,0.06318104502441585,0.0738688565330371,0.10175375017468707,0.09907939583890468,1.5036408184542247,0.9757977648886028,1.087497199478332,0.40293034567255703,0.07047933150338825,0.08128030224850705,0.10901025196384101,0.07711328988450383
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Role Playing,1.5672304132553454,0.8465597956452606,1.2359292747998047,0.31927866415282424,0.2693706829783478,0.5475375294106937,0.09675508640709696,0.1539540270209544,1.3870809755129723,0.439458206921695,0.9746326285252572,0.14860020145627728,0.20628755186741227,0.43321834540694965,0.11767572701908602,0.058983425805030576
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Summarization,1.2234233793275735,0.4029210356347477,0.8335021694904308,0.12254595282752578,0.13884696720934503,0.23683433643882829,0.08680254085252237,0.03035681060709783,1.192039619323645,0.3925851394138946,0.8121208288261558,0.11940235361405838,0.13528520766657218,0.23075896457567896,0.07348886203892391,0.03256114705822277
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Translation,1.4647030886947663,0.6883707270773932,1.0970402709670237,0.36879780656051664,0.13355035265639997,0.20694629717578128,0.10787965475292738,0.04586632230438714,0.9985336718445967,0.12851566093823263,0.8055259917971158,0.31766864655588833,0.13707267593277958,0.21079605355868447,0.06844072559830905,0.031327435569616036
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Brainstorming,1.0911197594255089,0.764607369154463,1.1415670600242054,0.3144637604481888,0.41581896300798904,0.5308442086192899,0.189926499955673,0.14551625246169192,1.0207393448160729,0.7667560048766955,1.1436543049499837,0.2968607327951307,0.41590437838460464,0.5323359426459198,0.15237770795594618,0.0891965529101405
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Chat,0.9941853388281919,0.5436697935172636,1.1048078836587263,0.41814206002193377,0.289453200223743,0.34696582302939344,0.201150949541686,0.11086617216154565,0.9424183880069836,0.6571381217520151,1.0920475693457885,0.3922563825609626,0.3464624941976371,0.3728842793602989,0.12995958855628098,0.07907709386495254
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Classification,1.2322011933032195,0.31922205780272395,1.1633630540087152,0.4036139142214441,0.42783451096503805,0.5286144975605469,0.15368025813776437,0.09416493311532628,1.120163940037913,0.2836574625769732,1.006260702532469,0.31807189227836064,0.38796170047494277,0.4547723526842089,0.1138825253741278,0.07198728169478585
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Closed QA,0.9914852156917764,0.6123879273390382,1.092274812100531,0.2620319497050344,0.3877376823892677,0.8269127105272509,0.2312916408210517,0.236735664529113,1.1707539913959149,0.7291705823318239,1.2785194731747327,0.3057075032906995,0.4514672254402547,0.85974567748231,0.1910989879097068,0.1028030802439468
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Code,1.057908244847598,0.642089281839609,1.0165813601763496,0.356056237588659,0.4426851882153234,0.5486650213196322,0.17921507933920366,0.2772668006992518,1.1725911811821998,0.6322530911597866,1.10576366144676,0.4246443263851608,0.47544478199189943,0.5986500517723438,0.1455411746341827,0.0949284792319256
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Generation,1.0951699441301321,0.599869270500712,1.1362762049007624,0.4445687116115242,0.3593863894569048,0.5661483186279227,0.22309736414754316,0.20143527247123294,1.1303433153004314,0.6693170485475899,1.190796646114587,0.4623069102852774,0.3776233495522705,0.6023462174789008,0.15820193977469488,0.1439723600817053
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Open QA,1.0234816748583002,0.7644048599192317,0.999473079970339,0.34931343008966587,0.43417980033561454,0.5816065235839015,0.20806215673341277,0.18351389984780464,1.1255916459834618,0.8579789055434675,1.0464798552922479,0.4356196247878945,0.4597554586561967,0.6025401406080534,0.14317309340684448,0.08986715353501862
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Reasoning,1.136954349666162,0.7066897278783502,1.0921743610909918,0.31243999070083595,0.3909778301529663,0.6051650979491269,0.19248083370442015,0.17753639730282383,1.086391583164458,0.7863907310019176,1.081042997166942,0.4098616409379354,0.40538197679754184,0.6024732875409219,0.15417325709721308,0.11972721534765321
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Rewrite,1.3441212973076424,0.4439339951659155,1.0082334794795982,0.12552165202379395,0.46793133954080846,0.5734724026354497,0.12767578160272808,0.07407386311702929,1.2750937443399626,0.6739560041176957,1.1614920590914632,0.4005547281317908,0.40938683310933777,0.6136412373769561,0.16874267891459255,0.10147946837556662
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Role Playing,0.8213774580975991,0.493447503179935,1.074484000768371,0.32682887791843407,0.5716160679791641,0.6466119880931049,0.20922249120950576,0.32187164073257524,0.9168011127470075,0.5507738440319382,1.1993123475957959,0.36184145446514293,0.638023654071471,0.7231036017197714,0.16239646461471735,0.13095451408219327
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Summarization,1.038737370579048,0.5348943933239133,1.1895129965506945,0.4870947134673128,0.6311537831371262,1.0944949133460282,0.26663867424426835,0.24795121876735982,1.0700065780232833,0.5509963688755196,1.2253210166530455,0.502936873915384,0.6501534640315634,0.9185518727854156,0.19689115680469893,0.17070394267099132
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Translation,0.9993155727873814,0.5436022904388532,1.0345409292045353,0.24352271275264936,0.35506619243863996,0.28121782465767575,0.2773902273286516,0.358479088166729,1.1611415533158052,0.3859457997937765,1.1673267372900826,0.3672044921570701,0.4761166201880707,0.4560643672237806,0.20384276335053297,0.053071074880534974
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Brainstorming,0.8791681608116965,1.0008724925286452,1.0661943230060185,0.7891101042242521,0.5806540431721949,1.2634893428570675,0.4203702519363913,0.5063784699727516,0.9478112300751826,1.0309373171147067,1.2069173856212785,0.8576013309656789,0.6043234920896204,1.3177244545208762,0.27203943562011634,0.3380628953253293
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Chat,0.5998622300667198,0.7036966502055654,0.6275199199394373,0.6257792873383397,0.23915767125232215,1.060976798651472,0.21097201238670493,0.48765895339237647,1.223885280142196,0.8721877437469033,1.2031025513593143,0.6129427358956013,0.5073292714260699,1.43610392724811,0.16543687755216607,0.340654800505027
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Classification,0.8374454597703105,0.4799881866501715,0.6629623485808831,0.21008626407230602,0.1588611522670132,0.277180434654189,0.1970034463654689,0.13996310958376143,1.124332912079825,0.5982698736422221,0.9888971462513896,0.2986864760171304,0.23997246473401346,0.34552545983897176,0.12859033369615863,0.06625230848648442
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Closed QA,0.8862531477809884,0.6541804634979582,0.9336788259877797,0.45992936397445694,0.29001213327635117,0.5036638514392222,0.1609591264540925,0.23287328622896852,0.9564557109897491,0.385743056028097,1.0549695378219532,0.48275913411038385,0.34358738356080987,0.5967079478887246,0.14012428873253668,0.13549008442154942
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Code,0.8161904988624346,0.9918981757008754,1.0611736038450634,0.7341653742600809,0.7450257408597686,1.148082777335045,0.2812513926735404,0.4533444443899576,0.9365709954073432,1.052330490479688,1.205182145242075,0.6956841447523004,0.7606756898541602,1.3093049562695624,0.310933297497505,0.3548932587160557
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Generation,0.9114442125606932,0.9130973761868615,0.864617696832169,0.49483167169571274,0.4805195606728685,0.8487027168437409,0.2610021476181889,0.3430087818438802,1.0458602066633704,1.095982454351998,1.0958757006430806,0.6142726475372914,0.5790206413213805,1.1416599366351696,0.192598351823773,0.2565262699264948
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Open QA,0.875704389404487,0.6285170662536339,0.9727949515762658,0.5312592196673025,0.5293272486835465,1.106950491874433,0.3293994251656639,0.3891274469891114,1.0274324002399466,0.5720168531190175,1.0673735497070802,0.5391330157532227,0.5543834340417813,1.2799334347395304,0.1924470504653013,0.2858907759525813
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Reasoning,1.3254436177996538,1.0105553080533443,1.198499894484426,0.508136796144681,0.42635877139650324,1.0237018949852525,0.2068962867636575,0.43234407652621676,1.182892481853913,0.814094992115816,1.098815097972742,0.4351159082221938,0.3830790638973615,0.9912948819279029,0.2016621764469787,0.14724164411082222
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Rewrite,1.426483404189501,0.5042936280587149,1.266139652797432,0.5067515062542347,0.44651162322026705,0.5313740226968975,0.1971702231380283,0.2929718353167704,1.2766425110679112,0.537206060660971,1.0327045321874226,0.5533756828146057,0.3643836578157824,0.6697224611197838,0.18043998174976916,0.24512478514411426
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Role Playing,0.9550562434605571,0.5859284223604456,0.829135907152005,0.1957271384754371,0.26306458214384276,0.26109161007808857,0.23526805946759344,0.3634905954044591,1.2026634176910718,0.7408150333714816,1.076860030641592,0.2693933102600514,0.34411257096312137,0.34539658880700286,0.167918659891776,0.15433018162588263
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Summarization,0.7968248678130365,0.3268540655166694,0.6936989463711201,0.28939329027872474,0.09501754746594893,0.2922163514445753,0.11865699137902697,0.08933023272977736,0.838326054365232,0.5475361370304221,0.6120379113769104,0.38654953963195027,0.0902512609289135,0.2982369761933841,0.053499849696628465,0.04744163747354091
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Translation,1.4048348328944422,0.853504763567372,1.2513836367268203,0.476202966760096,0.2739528294792894,0.2547446425847645,0.12727822989054582,0.03375288344236027,1.2732703067535904,0.37703572688901943,0.9888303838621304,0.30260952874054203,0.1306919734397857,0.2321689261106381,0.0984235292395258,0.062081902708726044
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Brainstorming,0.9160759453347702,0.8059646220014929,1.1556060339790455,0.9044958434118617,1.0002694821488025,1.5279124193697664,0.5149220947739277,0.5507131297864336,0.8637747275510548,0.7599499532003535,1.1103966225526078,0.8528557645170254,0.9431614309062272,1.4406798262568679,0.2767811562967575,0.47470156896571347
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Chat,0.8128760317399581,0.858114350028095,0.8225275710969564,0.7675897175508597,0.34557048692326725,1.1206693813427502,0.19795913743186067,0.5382245708516695,1.0891127847825541,0.9873503831993744,1.2327913923830103,0.8803607877601654,0.45883725134462255,1.518270746878148,0.20687320706968348,0.4553570575902483
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Classification,0.9679115183732603,0.42104622281718995,0.857957272534039,0.32713624309826794,0.20828475711829653,0.03800784488987016,0.12512210893374037,0.08017607874516511,1.1710138326344035,0.6168313874395837,0.9534338841255598,0.4700025973688329,0.2348782278851517,0.07703619550991866,0.11932938081762473,0.08631670969524663
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Closed QA,0.9053946757389602,0.6867428040129656,0.9660222251808641,0.5486193275730294,0.33771661222046573,0.5752963219802119,0.2669874824794488,0.17991295342110136,1.0150821769250902,0.9397128221919514,1.0830546827484675,0.6034982907153925,0.37863058299599994,0.6888924188608359,0.187123289795476,0.17510005071457935
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Code,0.6697784346549143,0.8406787481878755,1.0155583582369228,0.7310037512269757,0.9317836947403733,1.3731714530378192,0.48809867636037907,0.491770209299619,0.912648579188692,1.0666008596956398,1.3605981749990361,1.1428709688843868,1.3609328693949472,1.67198366760136,0.4184122217754678,0.4671881846759055
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Generation,0.8620412873794957,0.7778477505654632,0.9133009096064469,0.4566330017483283,0.44099506456230575,0.9744302343765856,0.2667740919687335,0.3721143007500489,1.0135785290769221,0.8967592738260094,1.103841532618744,0.5830161416560898,0.509810609428431,1.0793220665659362,0.17691754668964849,0.3256371138781018
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Open QA,0.9688539833375965,0.7784760605416874,1.0741308104649272,0.7318764039717316,0.616686241663976,1.2270108448186827,0.313885814096714,0.4672396549881924,1.03457046180134,0.743658362079736,1.1649643702991923,0.6928241085547892,0.6888563568685782,1.5789439247151587,0.22570235454533316,0.4310830046347165
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Reasoning,1.32604820482101,1.0998173440068122,1.1859217176278767,0.6810792876995008,0.6088323669611744,1.1392830643883898,0.24707206580177893,0.5501017678139755,1.0940934977003058,0.7529635976154312,1.0501073635771774,0.6204133972727308,0.5548065751879823,1.0735283152051194,0.20288648227515782,0.1697323499901944
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Rewrite,1.450060886378246,0.6191209428218445,1.3805966370797593,0.6431799789947596,0.5815008829954251,0.4146453149344067,0.2048148610080191,0.35547534888497667,1.4796961611473831,0.6700987900447175,1.2820036441349059,1.229122813735545,0.5651351768951338,0.690808713415134,0.20998169457676913,0.4568461500296585
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Role Playing,0.9352393996096059,0.4187685991533774,0.8434363308613037,0.21520489338420945,0.2853312679527795,0.26528425277387857,0.23402254518252985,0.39049529840093344,1.1855029301743683,0.5239364250749292,1.0760557559059702,0.32620245399324366,0.35338558682904736,0.3188162836302924,0.16187486212252844,0.2314192657845356
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Summarization,0.7212998527052923,0.5167849554443406,0.6721520501206499,0.4102056940954619,0.2205368016546669,0.17498432837841804,0.16633516429100137,0.11579986972267298,0.6179205667463001,0.7016879675200063,0.5895246669409757,0.5576309039900681,0.15040056048006337,0.2117755932980277,0.048193385707362646,0.10644996717589528
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Translation,1.9999106543213812,1.3445833491196217,1.3081064643888323,0.7590183976271911,0.36881795604355977,0.32043808787430234,0.13008952556055065,0.02289982832635229,1.2009225717510108,0.4701400403742695,0.841824496215431,0.38176706019618767,0.29167167977366276,0.34540077313690243,0.08940466875991837,0.0878400041732238
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Brainstorming,1.266246563391875,0.6067909715423863,1.1079388929559442,0.4662185730599211,0.4987658219161889,0.8883057952344526,0.18854990225309887,0.14298782443299357,1.1930012780679848,0.5330931251927173,1.0314439947643133,0.4341873458554053,0.47625078932241677,0.8227834648216072,0.12985226075510115,0.1682100958496558
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Chat,1.5076550208018908,0.8033657824420397,1.101843940244113,0.36535860084048055,0.2619487165490062,0.39437297546861444,0.11877357657714249,0.05856594859071429,1.2865986852572049,0.38148784014952253,1.0059546998294313,0.27784281737671845,0.2790688531857338,0.31179934891372807,0.11875352696056551,0.08128582592410288
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Classification,1.0892982721621611,0.6374880867629731,0.8438153254392802,0.38757320994319044,0.12565805590911736,0.17338395214383018,0.10657845716199915,0.0587101932066284,1.1404485075133617,0.6641301850637498,0.881674415301005,0.4012329272846492,0.13129764884196335,0.18161253099381428,0.09234821546793437,0.062160310126644996
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Closed QA,1.0453783651208495,0.3783032433186986,0.7937132031473115,0.2654190805930775,0.15948907096157203,0.31577627799853675,0.09731172574567148,0.054661964386587214,1.1097607751930425,0.4381904388090607,0.9021850024315979,0.21681319797829757,0.19109461793660695,0.3318489457628166,0.08673563555634789,0.0471901503852048
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Code,0.7213255709398372,0.19573658708920205,0.6991004735512107,0.24248497507649158,0.24068350708746325,0.41853798622036786,0.1844471583002073,0.1216643646807376,1.2759660310057068,0.6578329128506695,1.1524586180528347,0.42155116571241213,0.4639453014079343,0.7288365237493538,0.16655855408188525,0.11584411470082434
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Generation,1.1323724038271614,0.6990937610482243,0.9759475904789714,0.34041846642842133,0.2165184963357099,0.329369096546398,0.12397286953472347,0.058027403197348504,1.2085512661915017,0.7476718140608546,1.0356378866737368,0.35549612192975666,0.2310844930660661,0.35246210026432123,0.12568953481907474,0.07843689773675125
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Open QA,1.2145334403831227,0.7247690849719619,0.979052123110273,0.4723168820232456,0.3421765522448273,0.7636143397554055,0.15942932171910895,0.15395910133035395,1.1863266811785074,0.6402728813565748,0.9427102698556074,0.3846553061047646,0.35665530420553965,0.6507649538668657,0.11144744026631692,0.10761544269503442
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Reasoning,1.2484852804893363,0.6480527274010052,1.0041031842562573,0.3586795030445269,0.36875806407175593,0.47290926041181286,0.12698812864348358,0.12347517207322234,1.2468530145337646,0.5739257918515985,0.9745145507456268,0.2339708051185052,0.3214086165164297,0.5347787686908362,0.11778173434817557,0.07529925307949845
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Rewrite,1.4391472153206701,0.5276128035010945,1.4948677026124213,0.6274092635285959,0.31414513977279346,0.5932510139555445,0.185402207284817,0.21036186685975483,1.0021076562789126,0.6874953537348382,1.0382709730777018,0.34854380975821875,0.23623848395190405,0.538515801315681,0.13055579219576846,0.11217336752794363
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Role Playing,1.0452575400670907,0.9302868377412294,0.9761011389847899,0.30705002411595916,0.24841631052802432,0.5983256662134127,0.1519693234102586,0.21839243390920993,1.2728069978624046,0.8176035296305912,0.9339373033000735,0.5517638105621125,0.1969679712355486,0.5150290921022904,0.11687165572449176,0.08058852543150008
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Summarization,1.2130835397380175,0.3775556382985662,0.9211702098568373,0.15364583711173807,0.3460429539651078,0.30157933418188165,0.11233484601789157,0.0960662819437964,1.0579242040093164,0.4096806183064571,0.7758475483176768,0.19753116460217235,0.29145164984021854,0.2742908941637541,0.08379178402765097,0.019767319310671105
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Translation,1.645930799317207,1.0729264773778482,1.330518779488552,0.6554960541502691,0.17688787870283443,0.27451452213991795,0.09588593556805103,0.024204397890823293,1.2249165426087987,0.28699691382527037,0.8521697615182035,0.26022616088240913,0.13181316297463608,0.23348133738815202,0.0933475160790308,0.04799016347505691
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Brainstorming,1.3132577406132264,0.4994872863075672,1.0372281874667237,0.342847245983676,0.5075358966082579,0.6819368000221538,0.14569120357814702,0.10556148336412141,1.2801662576712574,0.489338913711296,0.9829834663876036,0.32906337729267965,0.497223995733184,0.6667321760967695,0.1339081852354177,0.11278691837157978
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Chat,1.355276220859479,0.5406557127921161,0.8689528121092135,0.2661439419950973,0.40354957315120643,0.36946134128945773,0.11954725257772003,0.10425073360730058,1.2744019042322077,0.6374982351634404,1.0637756438696335,0.2559669616441673,0.426377114404838,0.4151837653537588,0.11846540457761501,0.0880403060079809
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Classification,1.3187239091062088,0.4973137311194127,0.986982956258045,0.2778323306653907,0.2802810175299335,0.30696624954291857,0.10351284138581957,0.02515094550450389,1.3845555726352439,0.5106444098697256,1.035688465200909,0.29861784489363163,0.2946051292606551,0.32372367426837495,0.12104184297472437,0.04594979310469227
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Closed QA,1.1126880935478871,0.5422266982251386,0.9120801880853964,0.23397401007800966,0.24481548337074044,0.3532780392944862,0.12115744105969173,0.06069681616064304,1.201129844215324,0.43670485890428334,0.928485555479283,0.2289601626001726,0.25874043079126635,0.3293660075633722,0.08761444984735345,0.03831331491976697
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Code,0.8949021677644928,0.5147667336698409,0.7497306797741389,0.23137004792190397,0.23414139056554642,0.4297183168026499,0.17288494486405415,0.1424011684914951,1.0783975313171428,0.6516550519160915,1.0737306753858238,0.3328642660933656,0.2705113116628748,0.6617468289114536,0.13738944460051028,0.07709607241882577
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Generation,1.223582448840558,0.6113500572781289,0.9764355943934143,0.35034648961277304,0.2373694428251817,0.4361744213219205,0.10733444351276539,0.06799253470499211,1.267259401671851,0.5976496167167578,0.9975185620990932,0.368772547514274,0.24699977015361374,0.4423006755852529,0.11032637870053486,0.07376041651369677
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Open QA,1.2351819166268476,0.45933031619770426,1.0013083348413598,0.4170595696452355,0.3642211364546498,0.7706006354201366,0.14803905737423662,0.11663315378036299,1.2176163674186082,0.44275714376369324,0.9591698197189743,0.40385406802522605,0.37054077703584787,0.6808212098883705,0.12070270326936783,0.10598993617373997
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Reasoning,1.3797125764649183,0.3559896031925798,1.1106046174569315,0.3129965799470178,0.3319298536840981,0.5908626856036436,0.12410208624749924,0.087177355574411,1.3028695557348688,0.35923690838599254,0.9694737260920075,0.2711123191361571,0.29986390211657143,0.5400400752534413,0.10811677262650238,0.06420192740810432
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Rewrite,1.376785809082849,0.7628102693002168,1.3239318334184205,0.33287555343868,0.2740831571914337,0.4837774319773422,0.17963680977926383,0.13280947979692997,1.2592426019367737,0.6413380442901793,1.0254526811411027,0.31402071421110356,0.22248286931457828,0.4769836571390504,0.12277397143705238,0.0861741814524585
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Role Playing,1.205483835838203,0.6974314508684034,0.9161929657791539,0.22702233975813968,0.41181338693587277,0.21227671659361674,0.10419833868197836,0.1434822842666797,1.7086783578082647,0.8243891542627391,0.975465699598979,0.5611532775614265,0.393946164889103,0.170331274125404,0.13229844281966396,0.06952340299182486
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Summarization,1.1469484881968164,0.4552414500021663,0.8160637164317751,0.32266175697421207,0.21658078627313043,0.30507245888393253,0.13678411603762664,0.08392650640012722,1.2536763532487099,0.44543834193916,0.8251199754953447,0.28220676742823925,0.21898428941778353,0.2985326558424835,0.1117457771475523,0.052295180530387064
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Translation,2.101074654751418,0.9381150273468106,1.306945105083273,0.5202400756122892,0.1019203700108849,0.11603771855968992,0.08568826560825893,0.035188841164832674,1.242300819295558,0.39048007888130676,0.841722257015603,0.1360861256893775,0.06026237995927322,0.10519500108750601,0.08395457777936866,0.013063852280161159
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Brainstorming,1.1927745408381056,0.5153242221643206,0.9908187126938339,0.3207723048165414,0.3996126759912704,0.6334900369399186,0.1506000019251098,0.10731983529504346,1.1968734224217417,0.515066987826253,0.9825861114034161,0.3196332339315777,0.40098591542766654,0.6356669786476159,0.12401769517490135,0.09505449162205548
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Chat,1.3576436434327699,0.7731307019878073,0.928547662268412,0.3306806122507645,0.20442613771807766,0.2598637343873868,0.11499916969539459,0.05721052401540816,1.3374790711364413,0.7755959090431899,0.9981798213338691,0.2363610662366511,0.22621480706273303,0.31296988600268055,0.11584908926863968,0.0363960045138767
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Classification,1.2379619568732458,0.9316836721494451,0.9088493495629977,0.6828220389239684,0.20153876289155115,0.1331079795028726,0.11428573687599508,0.05298207549976908,1.403938075159774,0.8003053662562648,1.0511974963327466,0.4481875199186245,0.22412820954882773,0.1746766354391121,0.09765741459709876,0.06370990536902696
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Closed QA,1.2150073270023598,0.5347418178727115,0.8806172401480717,0.23753470239558183,0.10048064396312291,0.20731351254460417,0.09518021602016014,0.0655943695393073,1.1865623394136757,0.5244786005963492,0.8600007830208594,0.2319736892960197,0.09812825430512338,0.21377849725033535,0.08326868155430645,0.031150041668174033
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Code,0.8246342504559743,0.4530291102820111,0.7489208661159455,0.24589204842125023,0.22059543674662616,0.25062413494250196,0.1325787519677889,0.11119383213752498,1.281936329687852,0.6526322626417056,1.1309748177163277,0.42797731192716704,0.33926481314990414,0.27793040644557265,0.130037504617057,0.10125820218439507
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Generation,1.1745118950603253,0.5966218696237081,0.9971288297626388,0.34478137790749663,0.21020088737113068,0.3967253011647439,0.1234263917926749,0.07766375965900804,1.269587674852053,0.5994222958646975,1.0862609524728866,0.39676035159737855,0.22416041948697168,0.43915825403606723,0.12812217894151612,0.07764832911598948
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Open QA,1.1422454813738916,0.5179950438788578,0.8677644119272142,0.4443589653168023,0.3407102295301294,0.5566858665543132,0.1404368476137745,0.09167242363914851,1.1295421981643636,0.5575988257506005,0.9272081948071402,0.2955583113748165,0.3390399724262214,0.45490570682252596,0.10229093058165517,0.08635204310142242
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Reasoning,1.233775263374782,0.41906636388499274,0.9754915646563556,0.31183347358275304,0.2396521106017012,0.4302188491524515,0.11873905294333525,0.07255672614143899,1.2962298060568047,0.4296873643735575,0.9787336430897156,0.27105284489844905,0.24109456934482554,0.43374149797444,0.10451985116980111,0.052282542641983926
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Rewrite,1.3651688640744046,0.9038746433514613,1.3912364449301395,0.36779941974288843,0.18248208903647611,0.5823835025103992,0.16492899539745437,0.1771477795121611,1.1366427943028463,0.6659585920075203,1.093575745987586,0.3097984045300295,0.1546381800377585,0.4719153514304727,0.12211085187272996,0.07864593441818735
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Role Playing,1.21067626476257,0.48998750806155056,0.9619128824860518,0.2536258016892452,0.23849716067109059,0.472951996585044,0.10152384769825396,0.10940750897736162,1.1290504987276604,0.5658042526517362,0.9007462440342486,0.39611225744747003,0.38481247767285937,0.4377241933528776,0.11649685430201229,0.0587185263061758
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Summarization,1.1630345801248825,0.6412859489715402,0.8857302997367125,0.2135694913354116,0.2263701863996792,0.44119087349325237,0.10346094992505539,0.0893200300810627,1.081862227667809,0.5496605024380019,0.8871079513569499,0.17759456149002661,0.20601357105669948,0.4446168595686961,0.08811995096746994,0.030741369616540837
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Translation,2.0061480294706264,0.5965857774383767,1.2200542175845592,0.5585787011628138,0.12935439222838813,0.2136657371629625,0.08156274984178769,0.054222030706212676,1.4011026974821628,0.3524327069166511,0.9784694520927507,0.2775363353244049,0.0714985514266935,0.2563131634230703,0.10091792591139548,0.04131306851604
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Brainstorming,1.239874760988876,0.5205995641226662,1.035820137315158,0.3978988043487961,0.5513639219265355,0.6837912083107034,0.16315425064512534,0.12202476349645014,1.1790310913685365,0.4561276954403364,0.9799552617301497,0.3750729960690701,0.5273774205986292,0.6760871804737464,0.1250984816027777,0.12265231547132016
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Chat,1.455312023534189,0.5958533823941727,0.8934600332706824,0.2138977914851895,0.4086644980240989,0.4086644980240989,0.10484119065274589,0.049837732762703146,1.4028874794357846,0.45973016590620674,1.0156663615522004,0.3387911364003917,0.39805614864225786,0.3680141751598233,0.1056649997622251,0.07629648347298484
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Classification,1.2141076592749753,0.9193910110523482,0.9242745178717502,0.5962980167250944,0.20211124630539673,0.1544985007815292,0.10760594719202987,0.040035883440746545,1.126684859571111,0.5112574400681984,0.8736304023898832,0.32986286325705083,0.1888296899684329,0.1474343832802999,0.08406121280464757,0.051006091380533775
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Closed QA,1.141873016557825,0.614467022631629,0.8704190581428988,0.2848204616833493,0.10713735322052473,0.18434322465217504,0.0926443413503436,0.04372405643080601,1.138358840265587,0.6986579614902827,0.867740296166669,0.28394391122679397,0.10680763219091703,0.20616760741843929,0.07874395936998213,0.045223226530352045
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Code,0.8078897345449237,0.4358023748460118,0.7789318693939482,0.280440150789673,0.17157245908892194,0.35424993171110747,0.15950910941080731,0.12314274965698246,1.1975812451084646,0.5774749653053042,1.0582170036090834,0.31991098127895334,0.25117947231329946,0.5453132957482155,0.12290006624428884,0.08439490834769892
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Generation,1.0736118864955457,0.6292638567040149,0.9439165105587102,0.46474481636708,0.3074006558676518,0.42570375316420006,0.13930921865342272,0.06844738397956729,1.2107177618756848,0.7222817019813939,1.056426071075259,0.5263904634489583,0.33400263570235245,0.48006835778934687,0.12276538783094304,0.09973431718936454
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Open QA,1.1924008281341743,0.5420721488842503,0.9091612885163243,0.47139240085713063,0.40422249261079346,0.844813904543025,0.1401042984972105,0.12699271584364458,1.1219124352381666,0.5517960560822343,0.9517718181294853,0.4075078405417685,0.4234693670778869,0.7843026731935338,0.11543252866022624,0.12055424010081928
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Reasoning,1.2656939174537138,0.6414186410478449,1.0630621236389453,0.43334741194077486,0.3914517270475404,0.6554734238008815,0.12686578975891205,0.09333148419448267,1.1751525425433327,0.6019254725092161,0.9385784241879416,0.3322251729514043,0.3674332202794973,0.5965681885383415,0.10511883862900046,0.08080222200308013
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Rewrite,1.416791507840681,0.920337756835018,1.4325081867162301,0.33180180235788614,0.2642993220916726,0.37118507734933437,0.15834784509023347,0.213590369327438,1.1360572559772337,0.7150711248539583,1.0974427752854368,0.2985501856202313,0.20267376731854259,0.29387080713323177,0.11935838572652158,0.07175900159699589
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Role Playing,1.2798528097086248,0.7459792840969794,1.0235275042790317,0.27750966110728403,0.39589373246084575,0.5069438677934813,0.1551800871478375,0.0955298387265246,1.2391806976430468,0.42737794253223993,1.0028105313258044,0.3282818056502709,0.338909028546027,0.40191671987262656,0.11606499542563098,0.060692954263890875
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Summarization,1.1893469494125268,0.3822900908825979,0.8313336520040356,0.24594520250197316,0.21654776389863933,0.6313200193660331,0.11502751682548396,0.08230579628386725,1.1311653630682528,0.31215102496434755,0.8179201747776351,0.22983806585218725,0.1888119158062644,0.6466239405774779,0.08018857243386801,0.030270847716332594
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Translation,1.5692772249193063,1.1325899193073627,1.3661017481503048,0.837759521814169,0.18101172059219597,0.26763082615165174,0.09715990044645517,0.016642889075300205,1.2014855698101066,0.4443269286115593,0.8206432875104813,0.277193937294842,0.16325493278097078,0.1225717086995845,0.07802169685824745,0.027039867048974764
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Brainstorming,1.1490492039043412,0.5448997996520955,0.8846776137804467,0.3279502535119707,0.159496565937846,0.28557747809646017,0.11532603919382967,0.082365011285914,1.2328839433196372,0.666898017676977,0.9492237767870606,0.3485744900479828,0.16068868599799768,0.3064132381124318,0.10412355862300138,0.08475455886890487
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Chat,0.7433812949990452,0.4228086416737047,0.6513842456938351,0.2751825369348224,0.1778424150715419,0.24205288703947758,0.11155477131987934,0.10861083052183151,1.143699346309637,0.5310951539423405,0.7963902351546658,0.263567902402959,0.31537650867614886,0.28136118870314497,0.08923405254430095,0.049275700052785865
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Classification,0.8446110696831886,0.5895476059621615,0.7956667050265477,0.2891151511374941,0.1297313617311353,0.2710692810774659,0.11930098758811858,0.07187476121469555,1.639570059444846,0.9660344708268644,1.2144030518358768,0.6634692420537727,0.246013681635683,0.30080415928118254,0.11807004155704048,0.07494488143109526
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Closed QA,0.9561837848228085,0.6357398332452171,0.7921267569546192,0.3863886470960054,0.20236594809719663,0.2860454844519116,0.12724698347636754,0.06869805136289431,0.8709721088905522,0.5836890076320024,0.7215352560504343,0.33762102039020825,0.1843318192375678,0.2587768409967124,0.07169078284097874,0.049202511398278914
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Code,0.8678709855491245,0.5636668545056975,0.7902443313915812,0.3148275503077608,0.27574934463198025,0.25160021669068666,0.1701457665140914,0.24855669806891445,1.1461847842814423,0.8809934275459931,1.0475080860338348,0.4545583339020911,0.4366369541780833,0.38682311574873735,0.15390079637003462,0.09525325445813726
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Generation,1.1767551801470657,0.6006159562685877,0.905865301505856,0.29198461510435547,0.1512596540819009,0.32276058303707594,0.104774756753086,0.06494565388521772,1.2361970107968268,0.6274760228573163,0.9344237649629217,0.3073138519360814,0.15000563906411762,0.32531446615439824,0.09363637607624964,0.051012773075412715
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Open QA,1.116850366649283,0.47062719104195405,0.9475984682355829,0.3621511179542376,0.24411211500346383,0.30813538442921895,0.12524282330689074,0.08280489550873427,1.1782351176247325,0.4507909478523402,0.9489001676277591,0.33957975431438947,0.23880965255429817,0.29310006438807973,0.10554158126784563,0.0624680442306752
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Reasoning,1.279248571990928,0.5889625980178472,1.0481418335983976,0.40558470777543776,0.30738657426049665,0.3619561153061329,0.1282947670439898,0.10219412740904757,1.1916003417419847,0.6065542027029338,1.0038044020446875,0.364763160558901,0.2762895963006414,0.31626373519552964,0.11025271760977462,0.07420032054757147
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Rewrite,1.1789080093821422,0.328493660891356,0.9201238951352761,0.4183547811909918,0.10371020836803602,0.2393150498600867,0.1055662182498866,0.0357189456654563,1.2539441848275223,0.2854482859509315,0.971422551362412,0.41022029742291455,0.11020877609933499,0.25420659565265236,0.0956363666568909,0.04132737372997833
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Role Playing,1.119752006053082,0.3674785903004596,0.804606926433324,0.27060647479263067,0.2835182501324739,0.29746483952492636,0.09653538222794156,0.07806821928867441,1.7226586156316201,0.510101048963788,1.2378303825530599,0.3835806997313953,0.41694541029685855,0.4512924936378415,0.14321954596720188,0.056304427252658884
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Summarization,0.9140164121966351,0.3587736720890632,0.8380303803184799,0.26148555093194403,0.13122898206857986,0.20068112521757148,0.12816532534073388,0.06271045005303089,1.0592477108859684,0.6130602404010029,0.8970370342408207,0.2741617759517907,0.15208041891061216,0.23951038869802543,0.08386178540715161,0.03562538137227311
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Translation,1.8128694184766438,0.4395515690399794,1.312882628736037,0.356756044690006,0.3489455386245833,0.23512639297879645,0.10201368843517533,0.03483669914066967,1.2343738328801426,0.30457995948596017,0.8609967439635859,0.12888254904737007,0.21137759999464867,0.20352852403900545,0.08033983862429872,0.0323581403676404
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Brainstorming,1.1930261731575749,0.4628313219363165,0.8999923080288642,0.3653458049061886,0.24295522878077286,0.32568567306388085,0.1134342400081354,0.0584160490772348,1.1862761356128968,0.5456541655786085,0.918196124665003,0.36930316404827046,0.24786939570871344,0.3266024464676483,0.09894299133899076,0.08002002284121135
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Chat,0.7983044818360275,0.41967374109437383,0.6623716399743418,0.18114962132289514,0.12759982837823083,0.20227472794049098,0.10894479026443282,0.059276626788048514,1.317159831862806,0.5251108021068966,0.9203264318353256,0.26082028395069934,0.1742350695120862,0.28542763251679315,0.09994618111396136,0.05400310710471823
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Classification,1.1646706557401902,0.8944876858029513,0.8225113937221755,0.38509028066468365,0.05187770800226052,0.11249568202664104,0.10385545389083112,0.04180005067849976,1.4952142595082203,1.1063488400627925,1.0559472400545498,0.45787543349562376,0.06660105015377603,0.14385718959097676,0.0990719353268874,0.06530886289125512
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Closed QA,1.1722428677777874,0.5880547646218973,0.8116654978160783,0.177571057463224,0.1280831610614817,0.10057347583978611,0.10295785862456097,0.03381062536861845,1.2231344379111417,0.4447010154392258,0.8672220855606958,0.2532084804517284,0.13142650828036598,0.12045378989498684,0.08774261817729889,0.02284027648178577
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Code,0.9241320903756719,0.5996547490199181,0.8640556279686371,0.3042847759221615,0.3051440340257187,0.3247995631445876,0.1399351886234972,0.173299465459138,1.1351920371548054,0.7269702700991365,1.0207696813288176,0.42238588948657496,0.358538295933509,0.37692078221495173,0.13147308585208917,0.06761427300326206
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Generation,1.25473164571927,0.5427826032907319,0.9617335830115408,0.30314967921994107,0.14363036237271817,0.26853158326946497,0.10551110450766266,0.059690532793922224,1.2482987336102478,0.5000632672440977,0.9620083153502541,0.32019206368841824,0.1453681733268461,0.28418777738049333,0.10039675571220558,0.05069711454166356
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Open QA,1.2358816710724858,0.4739882513746909,0.8933778107525235,0.2754347375649694,0.18721085931250536,0.3538196630014422,0.11598127194704666,0.07056115434174604,1.2919232088004948,0.47184980566956636,0.9174218666476557,0.3049666105651644,0.21081757691547534,0.4014226721511826,0.09878420723802073,0.0670401272996699
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Reasoning,1.2565038655578564,0.5628006319220391,1.0016756589190783,0.2656606766370072,0.14612758123618103,0.445164505420407,0.11479952273195448,0.08998884760503018,1.198524201996752,0.5042473751373292,0.9291472714949357,0.24946680130580867,0.13965409435435916,0.40284456997593776,0.10745630087019059,0.049066447376419464
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Rewrite,1.3916759059736843,0.425272344675363,1.0147480178799655,0.2412842626640248,0.1143081795888318,0.13134565667342513,0.10368252250354404,0.06749436161423278,1.3280949366313823,0.5921671899453796,0.9898499337276165,0.2312150872951313,0.1103669727510711,0.11201318451844289,0.100755512457043,0.02717041910217896
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Role Playing,1.165476210212275,0.6706240980105882,0.8882077173976571,0.31481292534204763,0.134688707732577,0.2130590884355261,0.09987658573353386,0.05202291258126851,1.6383173879208663,0.9350235290844844,1.0977161590602087,0.5184730888097389,0.16533478226724338,0.28628212740815706,0.1418198128564101,0.06469064210073561
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Summarization,0.894890463038994,0.44063829623038053,0.7350416039616353,0.08063600265568749,0.13668245505098686,0.11309984788070461,0.09473239818679435,0.030122604778889506,1.033220883483935,0.5087513036868416,0.8475519763604402,0.1509389360892449,0.1455385300140751,0.11346921215684586,0.07847550394787106,0.031894435350827366
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Translation,1.624320037511772,0.5741992277020389,1.0481874790767296,0.34048549883712753,0.17367754418148088,0.06379991418911543,0.09530915233239046,0.01514676715140384,1.2909608084605801,0.22871103881411692,0.8779546603620725,0.1213032301623983,0.1400146614039537,0.03700857322027987,0.08568769458281822,0.006906658171601299
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Brainstorming,1.3312459053573915,0.4638131676719537,0.9838040406982768,0.24747749711915712,0.19536047097190629,0.3311823527420065,0.11669073250830408,0.06739560120648141,1.2859680463547611,0.4998312433990063,0.9503432499745704,0.2549128950722015,0.1775539248499509,0.32336137736408954,0.10252658725848973,0.06707956708864399
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Chat,0.9999123968176936,0.36841706424000487,0.7514048009290835,0.219954079092596,0.160829768743722,0.21784069710635406,0.10003757809873548,0.03350317988655016,1.218171189842578,0.3185022136225242,0.8871710949142488,0.20378617386622555,0.2332078102523063,0.25490916071652614,0.08596260789611654,0.03987251255421742
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Classification,1.0052868251800569,0.4379151410008154,0.8495264906544364,0.3455496180065325,0.2507721919888234,0.17013424784606349,0.12276704339392885,0.0304245621581809,1.278900117182941,0.6303635767252804,0.9640627524863612,0.5256652100675623,0.2862424019315874,0.21036150767172956,0.09343602252161098,0.0669812267776089
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Closed QA,1.119224706462159,0.5391895254540989,0.8755540960996759,0.21923189028140322,0.1322109377141374,0.11010860107391829,0.11219443756664493,0.04243885069976927,1.2450932363403093,0.5655828709442582,0.8808679361942414,0.33899428213338856,0.13442830539103967,0.10695533618248254,0.08556054907846122,0.027407988799619032
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Code,0.9007541935320905,0.5627026495394384,0.8101652176909223,0.31725549202384085,0.3165538305431989,0.5390551647450399,0.16051301685802244,0.17076462078386134,1.1939515077880907,0.9441100477790738,1.0943757664983138,0.36219551712695786,0.4462623366417179,0.6790327847929358,0.1648940184692833,0.10282968490725608
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Generation,1.0866422345153315,0.5389879843905102,0.8605654125557516,0.300527583745985,0.16556598373805467,0.23533698068914152,0.13100179021061592,0.06354742625028117,1.2275267740637053,0.5886295431564166,0.9597269704981772,0.3131467362075049,0.18623541406074925,0.2682651482115601,0.11239395664310392,0.047272164061825384
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Open QA,1.1538897693994024,0.46865015319808057,0.9307614185552856,0.3630426358776414,0.16445750788831726,0.33321455846652515,0.11690578861380341,0.08538846310420134,1.1625919458982918,0.40201471397202826,0.9016638983302845,0.3315525288876203,0.1593162272480418,0.32279758462021535,0.09632072827227867,0.06782926535290723
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Reasoning,1.1549310648946103,0.48702398016191006,0.9784370768197501,0.3851729643322479,0.23879257017525476,0.3728509676389546,0.1285092949113143,0.14216745629930228,1.1231857647286088,0.599725044410964,0.951069085477944,0.2692829508795108,0.216207373389799,0.35032400306574213,0.11006413324297548,0.05992497857603862
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Rewrite,1.3904317976979175,0.4020408316820403,1.0342415579822892,0.2638478099682151,0.07937358887665312,0.21033014340855924,0.11867139567122775,0.05230627726012971,1.423682244257917,0.454473466128954,1.1395190095771368,0.20284439542214872,0.0761949853671358,0.22290929024392236,0.11218740683450978,0.037529681982244156
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Role Playing,0.9915820328560304,0.9663893999074524,0.9722639486868689,0.3508344725628565,0.17030219873238733,0.24557778598273844,0.10910597529063382,0.11721093023718049,1.0956948723172513,1.0614763050549554,1.074348453094449,0.42515177956160743,0.17885293674405534,0.2652196491377039,0.11534658077701432,0.05595489749268773
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Summarization,0.8599085379781294,0.6567551458807964,0.6971697274931928,0.24330111841257063,0.07268914360096375,0.07389838998249551,0.10024251037641163,0.057717651932887803,1.1875490675791498,0.7626441897574826,0.8517164266863321,0.1924769907671824,0.08440885979107982,0.1128632603276998,0.08251699946345614,0.023047780458559597
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Translation,1.5866656132787078,0.7763361769433799,1.2166586139815039,0.3268818769916363,0.17003347731426918,0.2579725613934388,0.09421716705027883,0.029911503479741697,1.2333757048403735,0.30396689090006856,1.033971391364722,0.19467486419325153,0.1828852705963207,0.21415100913220503,0.10101288036577105,0.03520368624021919
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Brainstorming,1.29159557605451,0.47501282877855067,0.9911964868722364,0.2610930741997709,0.21004902544253828,0.319192146505818,0.11736779586132684,0.06528031325064365,1.2408308692131027,0.48155530492564314,0.9522386272982593,0.27174973845935546,0.20047437512796729,0.3066466593240289,0.1066419044475514,0.06818900022336316
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Chat,0.9747373076091647,0.39511182694354585,0.7472375860804779,0.2458589902421351,0.16117487311734638,0.1949131397856495,0.1040717467456228,0.046561192191208356,1.341946413248089,0.45112790065993735,0.9047310549807037,0.21054491990494117,0.1626868194304695,0.22475987674143266,0.09557153250467743,0.0477640008733779
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Classification,0.9945066276508152,0.5002049831371818,0.8554730878980182,0.3944940356922444,0.23846742300241114,0.18046368885242914,0.12242616958980973,0.034283842345429916,1.3864957609672122,0.7746597791225684,1.0226176947103989,0.7191422839041361,0.27007617547506824,0.232101834153877,0.10162705003222022,0.09123583367481736
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Closed QA,1.1076310701113719,0.48481168445891754,0.8904774452788654,0.25620413355675475,0.1436021441914347,0.12163623303404504,0.1129841713344727,0.055918548031521964,1.2490569619936949,0.5743073306078782,0.8678231326252536,0.38242067607056174,0.13539000852283986,0.1579118646251096,0.0783322975414722,0.030021648541462276
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Code,0.9247648597261032,0.6290487807697455,0.8034642170014071,0.3216938197209652,0.25590286498358916,0.5097807475323564,0.15749287777995058,0.1279374862258319,1.2521014533918078,0.9311363988106024,1.1026312005943892,0.3970003191659939,0.5106724032294165,0.7815838019118629,0.1751990500010292,0.11271221607796555
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Generation,1.0951379581406067,0.4854466365783108,0.880022663084771,0.2800405793361407,0.17710015870645385,0.20948271679551186,0.12970733603793622,0.07494585102059573,1.217287961163982,0.5313135586571593,0.9711481005695147,0.3112758747236333,0.19685363794678912,0.23129508953917827,0.10973939972532465,0.045651776992521625
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Open QA,1.1818346637399286,0.4848974887993759,0.916115782207881,0.3379241874094807,0.20153723486904981,0.3557104738049782,0.12080583361125308,0.10313845241378317,1.1097770838979875,0.44261453876630696,0.8836913969720258,0.3345414273445403,0.18950285835629507,0.33208872255575483,0.09480630461661799,0.06199603397426523
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Reasoning,1.1805304377649586,0.49243325500014834,0.996859573342501,0.36498306286386584,0.23681997966560692,0.39314690754704507,0.12883176152754733,0.13894647850484687,1.1519827445341055,0.6250306499906274,0.9598062885868568,0.2602682462980501,0.21337231273740537,0.3886743434026935,0.10642989810005854,0.06317360857655974
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Rewrite,1.3826168204129434,0.3761490677022057,1.0424960418877234,0.2703847308898717,0.050384308717262453,0.20835009950145894,0.11663132180708469,0.049869165934313064,1.394385679909985,0.452112236121889,1.1475800399109823,0.21561254158877996,0.04732757907852961,0.22287328908500956,0.11432308854880435,0.032904258490643795
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Role Playing,1.0411841888602682,0.9158412083184139,0.9557001846060936,0.3071032683167685,0.18931869678775184,0.18852929685553316,0.09862498685431209,0.12953226547921592,1.0906903880320893,0.9755822320565883,1.0189572497764188,0.33457867058904633,0.1983204173386407,0.19749348303864278,0.11846897912452892,0.04532016803579153
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Summarization,0.9110361652527347,0.6236259464527649,0.714235329976997,0.2534698352408088,0.09239411380577012,0.08347046239808056,0.09832736516249518,0.040018412009079296,1.1885078810108034,0.7125289011447521,0.8179986900587731,0.1911720284797842,0.10556564677389502,0.09536985622366742,0.07996621512132795,0.019721802585985104
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Translation,1.6213588173048217,0.7267970854201293,1.2279080193853191,0.3581435035477585,0.20689142571366353,0.21169646877934253,0.1026651983175958,0.034625774569916334,1.2330511778221356,0.2832890483645758,0.9324073632922733,0.2566731367521232,0.1932609051408625,0.1682998640429794,0.08676424864853405,0.028670162634166108
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Brainstorming,1.2550386132688756,0.6509246024855813,0.9626332762694526,0.34999306652740236,0.2123367046289305,0.2712118818214976,0.11709791033073091,0.07916287764431934,1.2473557905707873,0.6702496857724476,0.9543393859714453,0.35628492216339636,0.21125348262168836,0.26982831189406564,0.10782477683786451,0.07072544263475755
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Chat,1.0423801863602042,0.5587109540548734,0.7187275729409557,0.23243652747398702,0.2116932600694489,0.2287445408957115,0.09820491673691845,0.05106690708783723,1.1902778108410503,0.41452881120621443,0.8235981153442465,0.20129794888643915,0.24540295606229065,0.22404099247200976,0.08011495892841958,0.048661078205876285
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Classification,0.9534038683693883,0.9431590245238921,0.7542264986548559,0.37019733740424976,0.07463956889986648,0.22970970773493393,0.11497393792021032,0.09003813030632263,1.1719188397471032,1.1105061458363361,0.9018875326311695,0.5111153865836866,0.0923866986350023,0.2606918058615496,0.08545861460442594,0.06316541453586288
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Closed QA,1.1746080433336747,0.5916472724433381,0.9642999764131097,0.18727141118162427,0.2020415916772248,0.2013981471177432,0.11811975138035535,0.06772404270137233,1.2042107470852128,0.5880447446165469,0.9164220876369221,0.3566088917084187,0.18117202606984678,0.2143312442234721,0.08478060868215731,0.04823481043621081
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Code,0.7753506941753372,0.7552430516915369,0.8271435128486101,0.3542341034546107,0.28955005176672344,0.6096637201088232,0.1884719004557629,0.19493928867132404,0.9525228238775981,0.9432224357679319,1.0429915089670667,0.41861012636788797,0.38807203677732827,0.7272085849985906,0.13912605205334005,0.1523195914763652
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Generation,0.9859179262656932,0.6011280258744499,0.8855305211653206,0.30128788806664186,0.1769472538574421,0.22263181758063622,0.12563256299289133,0.07338973544465333,1.1522392143227638,0.5603527239735787,0.9451892683074241,0.35493718820820985,0.18171438535984685,0.25350818013411147,0.09868280681347952,0.043458040243814
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Open QA,1.2201317459169985,0.5830412014602718,0.9917591964072372,0.34888993891891606,0.2200580393427098,0.355824841393329,0.12945283743371638,0.09686975504391215,1.1568501553684691,0.5546939970490212,0.9092955131375811,0.3203317719150949,0.19944125858446263,0.33809162429249384,0.09686637108832208,0.06488185377169475
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Reasoning,1.1277974516313878,0.7521866900339993,0.9958086521852247,0.4819355066867278,0.27877235539540646,0.4149413102957017,0.15634987643130438,0.12030758678787659,1.1148422920121668,0.7734229701382594,1.0063571029645542,0.4326231781220694,0.2675760268140238,0.4327281715707518,0.11377849423613312,0.09070324995740048
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Rewrite,1.240802402390345,0.2997395995953891,0.94368294031972,0.33443701233806,0.09941218443990837,0.19914609115955756,0.10441492406954744,0.03497520042364849,1.3917378331937171,0.46940914230375874,1.0811724512587504,0.5996631338200851,0.11930487618858941,0.23616434813562143,0.10593144511660224,0.03953495855761335
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Role Playing,0.8310086485704963,0.8521216731784865,0.8219468043577968,0.3686859129442175,0.304992721194282,0.24129170980560283,0.15048292529203755,0.17197915213567933,1.1364707317208385,1.1041242817173653,1.0650256337058845,0.5212534099528956,0.41710191779765965,0.31265022843476287,0.16817990011489814,0.07872240905253736
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Summarization,0.960984449585781,0.3905708476053359,0.7920176956119093,0.19149345755759795,0.18080792121433173,0.09587323936275954,0.1172632289111668,0.04061184124179085,0.9992544348353373,0.38936135755321355,0.8065950308657007,0.18070004360712877,0.17936154339746394,0.15422123282031397,0.07595216765585455,0.03226908123640526
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Translation,1.5243201614119284,0.6973330413381922,1.1576589763567047,0.48510613082060183,0.12225446630150545,0.2142670383073753,0.10737498001936563,0.037105406682402586,1.2949090809748502,0.2771668809584955,0.9921099488897753,0.2715772575395302,0.13269815614864416,0.24693381076293341,0.0981490461052984,0.05606551550076677
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Brainstorming,1.133278591730352,0.6728143118556922,0.8613836238809541,0.2638387056711704,0.1475273906394541,0.19446792402473498,0.1034996260308475,0.05571147011359939,1.14830658224941,0.6823563614045682,0.870148822762781,0.2509520601202291,0.14716162024943896,0.19614419443130163,0.08901580223772176,0.04402438270082082
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Chat,0.8058124897806547,0.3744066353349783,0.6443147023003432,0.16460232011161313,0.09388106677056171,0.1547919969967,0.10294534706522662,0.047606319275829534,1.3232866328778914,0.6280004359420501,1.0580787016006996,0.2612989576128407,0.1541693164517932,0.2577654355223308,0.11341781779397608,0.04449517012178544
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Classification,0.8751056581113075,0.2111625482495596,0.725249867998377,0.17279731609180227,0.21290884785466674,0.16205660335394584,0.14689978979813412,0.08388598110914508,0.7645909866942369,0.3157308318435441,0.6897070573915278,0.25861428985144286,0.20817049952574726,0.21449787935002101,0.05865919913117629,0.036951190625241104
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Closed QA,0.558815873634296,0.5929036419259879,0.5542832559925954,0.17795181042621017,0.18105634305751186,0.27940793681714793,0.16228856429049654,0.25274960113521927,0.8185567469376155,0.8938639676558762,0.7822219224485514,0.2505693153125701,0.26521238600778746,0.4236112935003589,0.07411347019311554,0.04204707765627158
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Code,0.8030184104124832,0.45931172213528915,0.6635084752933655,0.2886672053900051,0.14696857476581981,0.25258477488270176,0.13686799804034577,0.07069990538208293,0.9826706312019597,0.5628661752781713,0.8146583704303252,0.24212597543531145,0.16071516195386554,0.36897700696050534,0.07692549763783674,0.05708181837944104
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Generation,1.0473955771511814,0.7010345134742243,0.8881835018206656,0.22646595378898204,0.15535081287033425,0.24755543201999308,0.12145414117672915,0.07883828262393283,0.9273480960689983,0.6288432492998479,0.7861003060419498,0.22263087510427626,0.1375266387116482,0.21440443608758575,0.07051409827709992,0.03718246300203548
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Open QA,1.0505738424324764,0.5979329847886966,0.7692410953839213,0.29055320896352077,0.1475273906394541,0.21235003198103242,0.10160515875852139,0.03919573765285134,1.4266207430656042,0.8109099436648817,1.0616654610465668,0.3331917120252308,0.2021815052976526,0.22881921197156102,0.11115026761714597,0.05645209179754962
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Reasoning,0.8890760549521648,0.3522111673540662,0.7367172354052475,0.20141043512148338,0.13970396840857396,0.18301219861523188,0.11699819180548371,0.05863317126846526,1.1027637784665738,0.4315119827476964,0.9072304015483984,0.26491198902701774,0.176199257961493,0.24376323233805536,0.09202960306465474,0.05112742718640467
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Rewrite,1.1758883020949673,0.6313658907284735,0.8250227556032809,0.2782429791378729,0.14417449539764834,0.09360165883374456,0.11090157052549154,0.028917104889681994,1.3503822822282017,1.271848460127466,0.9320098093324763,0.6251910142031781,0.13867044300895215,0.10976502169044253,0.09468227166375304,0.11024040390395198
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Role Playing,1.1735133346320215,0.4462144750969854,0.8613991465441106,0.206078876065803,0.109527911232322,0.33864241942238327,0.09523547757807271,0.05822747937740813,0.912648579188692,0.409705242412153,0.7306456569773307,0.2016307610244502,0.08518053405761125,0.282789429010185,0.06487359747292654,0.03149131579777098
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Summarization,1.1192034169131884,0.21346766372830095,0.8561059184077413,0.1448574925765348,0.07376369531972705,0.05588158736342959,0.10371926459017788,0.040346801445382274,1.0584629086967052,0.1574488971669762,0.8093663380533239,0.1690741562409538,0.07923234825176871,0.056977108241656534,0.07160018752452191,0.017215693670957433
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Translation,1.4126865285475,0.7150049103150815,1.0373019654336617,0.16332059163954726,0.18776213354112342,0.26599635584992487,0.12201001197370442,0.08315965518708246,1.2080146227111814,0.25327299067187625,0.8870162750821324,0.7242779311474591,0.13459226039442262,0.3703576239574048,0.10321098558063024,0.09894153466648514
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Brainstorming,1.080621433656706,0.5581470877106082,1.0161836882554303,0.2557883251270292,0.43246134657292734,0.7735632389344154,0.18362375052876334,0.1963146821636687,1.1681397935862772,0.5674316532162393,1.1038509102816783,0.2775583971724864,0.4866361538586229,0.8362132881234771,0.152088658504272,0.11312605961340333
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Chat,1.1886037871060346,0.5980441237439473,1.1615211593963042,0.253101187290504,0.6279466416105052,0.706805675595052,0.1647908632102616,0.2819838080670252,1.2120115524383186,0.5289231306079691,1.201433185404158,0.20009720146506416,0.6301769467806073,0.7000369019162461,0.17756033049940134,0.14446828516931037
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Classification,0.9066794160314327,0.39582447665261944,1.022253816960466,0.2687184011245012,0.47568088459143953,0.5003492670604671,0.20665566514328448,0.24875314993371656,0.9455921465309522,0.32596007931858484,1.0505961468748088,0.3663422666909988,0.5218227167076837,0.4231950633679145,0.15576332293828937,0.06740750577616209
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Closed QA,0.999235718988,0.5079958007098964,1.1490191230542075,0.4610675091725306,0.39575798505566784,0.5606571454955294,0.20730147805404697,0.14683654617846575,0.9130349987518954,0.41368982981979174,1.0697466729666736,0.37009925974489566,0.3924309873318372,0.5386176652639776,0.1445754013051468,0.0830253897811769
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Code,1.0889993748726021,0.5672398135937337,1.07595963391488,0.1911430243588803,0.47342017029508654,0.5441672294515433,0.23917247114523377,0.22359376974221545,1.1497512134375183,0.5773682492640982,1.083695470063485,0.20787073487178076,0.499830787622123,0.5704626419320684,0.1457840148181614,0.0868378723981057
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Generation,1.147578471786924,0.6229182145910437,1.0849932561562707,0.4638860140877552,0.35858918235974746,0.43206239699121796,0.14878959263058245,0.15064265529616142,1.1233219214098857,0.576688311308948,1.0700215238723334,0.43809371459637814,0.34110358832120014,0.4041480165019876,0.13660498055356401,0.12551792304420611
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Open QA,1.1282294170740208,0.5609231118833358,1.0905139056920543,0.42292534337679144,0.32275021160285816,0.46876575850847757,0.1649142153385964,0.11089630180368071,1.1452112112975656,0.522093567074609,1.0860525402922387,0.3419668286856663,0.32053930575206696,0.45999343962117106,0.13654199390419974,0.09072768840583967
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Reasoning,0.9209252406783037,0.8470572321899331,1.087080353505028,0.4409785045699408,0.408790338058173,0.5921907853498054,0.21795963440727084,0.15167369097746808,0.9665756832801984,0.6696222488715338,1.036276218910292,0.33108390943712374,0.42323346355083935,0.6356019418201746,0.14442417310352484,0.09946470741541827
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Rewrite,1.2642712244369068,0.6242148007315991,1.2762766516642712,0.32845767086340993,0.30133991738445676,0.5452643408012441,0.20666208915928858,0.11383089237330335,1.191458642002599,0.5428839411724931,1.2409605082327362,0.24881910608198,0.2755363072056797,0.4860519686031196,0.1672952208061551,0.04392483165057787
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Role Playing,1.3162676532530242,0.2596496860958304,1.0812494098501373,0.3537851644799974,0.5645136581187198,0.5417070403643357,0.19234591569447745,0.06635309444811655,1.2507933559343025,0.29129822465783795,1.0011878970487351,0.219058037761092,0.5360542954004154,0.544486805010907,0.1478162916307445,0.08124883007992312
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Summarization,1.1080159716007474,0.7223647092817161,0.9224896401954694,0.43505082487642965,0.4932346661866505,0.48326092664391695,0.23345821189860405,0.3968314426331161,1.201851370393283,0.8925535066191835,1.022089742261701,0.4513553879310428,0.5350056088320555,0.665545786656732,0.1815554728349733,0.21142902330311242
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Translation,1.2317568335275952,0.3907046236873496,0.9907137126453754,0.22184551925126184,0.45480252314865055,0.5665084060272665,0.21322878313965943,0.12188040715537385,1.2092285418848394,0.2135385323047465,1.239368384294818,0.2884221783622547,0.402997623586645,0.5745996616980309,0.1916693683881736,0.08348751538837085
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Brainstorming,0.8296121436833078,0.5423503210782843,0.8292291279198345,0.3099874245711097,0.28955991718586366,0.4458303486829964,0.17317379931135068,0.14883080192150705,0.9419387395398486,0.755377465642067,0.9542601911958792,0.3817475639547846,0.328765321833853,0.4996711042157368,0.12966629210082603,0.11232398047990683
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Chat,0.6595531447011339,0.32632943047930674,0.7657761831044138,0.40471999007018844,0.2527904038924207,0.464215105329718,0.27273369057482066,0.2496324712151986,0.8194996209814254,0.3849420900055206,0.9184818039699616,0.46254987811780945,0.3157190789039382,0.5350501943655002,0.1238405963035929,0.04951580400388983
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Classification,0.9456659200157373,0.6336995806666819,0.867466868306592,0.5173106655412132,0.38722893687157167,0.4544482033611471,0.156720982930321,0.081400169234706,1.0513369224417384,0.6905290064942027,0.9643997191211544,0.5572643537620605,0.4304988370144178,0.5087668862456398,0.10998574177469,0.1064745942595623
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Closed QA,0.843400711168349,0.7675635900006228,0.8010136333439632,0.38237740386757557,0.1149047290420094,0.39067607874283194,0.14350696838466992,0.05525338084580489,0.9236496594325017,0.8607307453022223,0.8772294827852732,0.44980032612690835,0.14791464023609543,0.4278486160858999,0.115376952081791,0.08854011722148336
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Code,0.9399206835636369,0.7583712116772622,0.877106098354005,0.5472018540822804,0.31483895757510577,0.5147731861082021,0.14770216253192836,0.08924448531311058,1.0291957092830881,0.5227886855458731,0.9729370946189034,0.4473987186949685,0.34541395294995186,0.45249399130246293,0.11565370656221718,0.10484928707028562
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Generation,0.8617854678150705,0.44468130139257633,0.8633813668295427,0.28770867432907565,0.2102756541468772,0.4532991560707271,0.18670038955287538,0.17242974903193675,0.869677890200593,0.43511285485585494,0.8379043585558279,0.3021654423210278,0.20844051854557422,0.4506055750649781,0.09087696746382962,0.0815924327873728
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Open QA,0.891660697365993,0.6250817259885311,0.775862264875879,0.32020117826373284,0.27577134970082257,0.2527904038924207,0.1664696650973898,0.15871630427753303,0.9729379981830533,0.6529191740415661,0.8500881104838839,0.29377819746091693,0.3154541398167461,0.3470919601686129,0.0952782786449593,0.09169033422546913
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Reasoning,1.0502292234439659,0.868679751557591,0.9643698564653532,0.47251378020497414,0.2642808767966216,0.39871940977577264,0.1474512545706873,0.11653054772725868,0.8802818380217582,0.8914345067051783,0.7261814895945558,0.47821068165400704,0.20075129702977118,0.3402914015333146,0.08356319904898873,0.0879770542255697
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Rewrite,1.1352587229350528,0.6710436176053349,0.8545081683090765,0.6768207720377248,0.2619827822157814,0.27921849157208284,0.1584634587806975,0.09385703907612725,1.108551422139837,0.7488924362520547,0.8883873115387315,0.5551663652264738,0.27188921831908597,0.254873880490611,0.09605417624321277,0.12279915383284765
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Role Playing,0.5883122126950882,0.2872618226050234,0.608229032395703,0.17810233001511455,0.42744559203627497,0.40561369351829313,0.2656167412225253,0.1840328115538668,0.6981571019143704,0.26897973030267996,0.776058812438714,0.188219542160178,0.4488561046964104,0.558990329310127,0.093185121036374,0.04800154302286985
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Summarization,0.7813521574856639,0.7193036038029788,0.7713937476353563,0.26223812605809693,0.24359802556905993,0.20223232311393652,0.12011442946380335,0.2544488123008354,0.9360279927430302,1.0351368390334685,0.9240982242080699,0.408365153696714,0.3578930560488056,0.28080839782290906,0.1222994546192061,0.06968261884931803
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Translation,1.3397891406298297,0.19533803937141592,0.9817970737033913,0.5486700811755947,0.19763613395225615,0.10571235071864865,0.1210985514985563,0.030476835972349847,1.2526256961708198,0.17972324312072296,0.9355761075712512,0.4426812966853042,0.1883318354382701,0.10954866548040046,0.10168500039672967,0.050750627854270314
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Brainstorming,1.4018291970880763,0.542252805312111,1.1119652905739588,0.25101657457176907,0.3087729509004573,0.4522336142419006,0.12658127250563744,0.0777968113817899,1.2570104015026058,0.41640084967527624,0.9488485243617119,0.2331257224761456,0.27225173877643827,0.41699316951834214,0.1229489158194702,0.08120146819487939
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Chat,1.0944813521148518,0.7192034579435267,0.7199226086197693,0.39311563497000557,0.1534364048320734,0.3531887522992154,0.1017314476171205,0.061450511089699345,1.3478115436548788,0.6259279718498917,0.9090301826457537,0.2505079206190355,0.25951222788676626,0.44460463320286014,0.0947063994681524,0.04308359795804112
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Classification,1.0265513029167512,0.4303448778908081,0.7805753807913056,0.217296490053962,0.1885890177038178,0.2567565845564572,0.09722538077226944,0.03869301442457207,1.2232886739602167,0.5032982156340391,0.8984704616110291,0.31855335413415387,0.22276529447635596,0.29063633244536385,0.09687687012489055,0.03906406083593636
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Closed QA,1.0389022209527696,0.42503784279720636,0.7906962719597096,0.21550768521821673,0.13966038086882224,0.11780875665125141,0.11396881971842032,0.043914191899306454,1.3115098100933795,0.509062484582288,0.9712478399563047,0.24498799004236071,0.1957013021206725,0.14027145354314097,0.11512544759215504,0.04732262515681962
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Code,1.1353343886955278,0.34196604312225654,0.8324103100380887,0.1730547033257167,0.2793207617376445,0.27837069111948926,0.1217167145979774,0.07621052921785043,1.3027935647716409,0.5178057059533732,1.0393511594398404,0.2252377455248692,0.3291973991054061,0.3923124997148892,0.11833629039220994,0.06745164669911152
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Generation,1.3899533143611356,0.5272669257960525,1.0119901511509966,0.24538377741725492,0.24321807824774486,0.41209313062484115,0.10745787629586284,0.04790776557250276,1.3049978081266123,0.5011548490240565,0.9444803306223908,0.24200935511639865,0.22985284415920154,0.4017154319077364,0.10744516123584535,0.05279658284072339
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Open QA,1.3064658587907427,0.46600963820515173,0.9569190438611224,0.377468334763183,0.2793207617376445,0.4674347441323847,0.1105316360685702,0.0732150810682668,1.2755510181017369,0.41989608791439315,0.909168480464877,0.3457373882402778,0.25786149797232444,0.43598856431274935,0.10571626441731863,0.08458129511908297
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Reasoning,1.1618176071766055,0.486005655746638,0.8679257971042673,0.3316224791527226,0.18336362930396388,0.27955827939218336,0.11796156013333492,0.058264959311440184,1.296906122660434,0.5462278397245242,0.953730851203078,0.3508559067765269,0.19154837280122478,0.3018841051238811,0.10554494754217458,0.058826829961120236
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Rewrite,1.570644870051538,0.7055313479541363,1.091545369996247,0.38446191014682585,0.12208407443295005,0.2946406504553979,0.09922487596837876,0.06547073063339515,1.3159368070829411,0.8147167884857791,1.1313557754291532,0.5871028863284371,0.1309920933409417,0.298763085134912,0.1268935930321689,0.110542244241529
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Role Playing,1.6110822507367708,0.6274622638779099,1.0995549931243058,0.16007865201837768,0.30307252719152583,0.4607842498052979,0.10743095749149945,0.07459069442095062,1.324335967290424,0.8864785549781766,0.9000660170566386,0.595087319977614,0.23123326413007403,0.43727732472246755,0.09366258945470662,0.09499740818466734
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Summarization,1.2778449814188158,0.36126435255353506,0.8754240975038051,0.21383186621119288,0.2650697024653157,0.22136645403017402,0.08715411570558396,0.026736285041160546,1.2311625508117134,0.33314978049671096,0.8585384994383405,0.23801638767469901,0.25481776830598524,0.17937457842829496,0.08988215615583905,0.03427527216289894
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Translation,1.8331612577305614,0.9313067234466874,1.2115841535151128,0.35354172992471067,0.04085303658067589,0.23561751330250283,0.09288817773298375,0.0173966865727172,1.27580818195633,0.47189935009462136,0.7969570119183008,0.25024151253892357,0.023221058973173413,0.2126021553356388,0.0786784052001499,0.03840052360874635
