rm_key,run_dir,model_name,task,RSI_IQR_med,RSI_IQR_iqr,nGMD_med,nGMD_iqr,nGap_med,nGap_iqr,SEI_med,SEI_iqr,RSI_IQR_tasknorm_med,RSI_IQR_tasknorm_iqr,nGMD_tasknorm_med,nGMD_tasknorm_iqr,nGap_tasknorm_med,nGap_tasknorm_iqr,SEI_tasknorm_med,SEI_tasknorm_iqr,DCI_tasknorm
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Analytical Reasoning,1.2248000500585914,0.4491102816770931,0.9001675120825945,0.24644420570902703,0.21052441250229573,0.28520153560417305,0.10913969272206764,0.0684290285346841,1.293981278025306,0.4744776881998245,0.9510122959790657,0.2603642841540079,0.22241560843520183,0.3013107711076705,0.10485809595488338,0.04311745788431326,0.719858582981359
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Casual Conversation,0.7567705057380474,0.21941265669720567,0.5195249273545461,0.1527931502077377,0.053837364837740286,0.056630813013283404,0.08446611563158729,0.041848439559011935,1.390029366002942,0.4030152255793762,0.9542587877272991,0.28064910577576185,0.09888799516530997,0.10401897604652886,0.09885529894434614,0.04791229471595845,0.6934555825457379
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Content Categorization,1.0965553838177478,0.17268588721539335,0.7262965256412133,0.12373000252279409,0.15167153844028483,0.44390431007721703,0.08214574621778326,0.03341654747120748,1.1341320480607235,0.17860347213554695,0.7511851916289184,0.12796997145659694,0.1568690054896404,0.4591159842543179,0.067204653920524,0.020359996456777885,0.8040599753657803
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,ContextBased,1.0782709957596475,0.2252535028824323,0.7609042447048864,0.09034265349540627,0.2382049444235867,0.2575051318182483,0.09319657420985378,0.03247992482704182,1.3942978406707027,0.29127229973502433,0.9839151284986263,0.11682088007974645,0.30801963602193105,0.3329764508808508,0.10424064297858893,0.026774154276154677,0.8501070500265829
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Creative Writing,1.3416169737631223,0.5776088977520766,1.0028620033441018,0.27445628324711224,0.18043135715667202,0.25991765524258104,0.11220801641763828,0.04281145021770985,1.263593858976898,0.5440174583083388,0.9445395322276362,0.2584949958525994,0.16993818603124855,0.24480187671074297,0.09558900500461598,0.07389563437333116,0.6674852479438877
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Critical Thinking,1.1442979526361212,0.6221770936436967,0.9093661394081204,0.35159354536722365,0.5694825030595877,0.2544577338085649,0.16300717568453482,0.04750590929286025,1.1743645139882295,0.6385248689017227,0.9332598401344543,0.36083170652834085,0.5844457218416789,0.2611436402610311,0.12309502146546192,0.06007721587204795,0.6495573484666443
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Data Management,1.0112282395466123,0.5235175830801962,0.9017123735736146,0.26147662505771996,0.206746908719459,0.3959077914247033,0.10766866712362966,0.12842105159671074,1.015433725608999,0.5256947828586014,0.9054624061291192,0.26256405147572937,0.20760672573114672,0.3975542888559591,0.09779314505761844,0.053266358786699136,0.6849111665138973
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Development and Implementation,1.1592809928503982,0.17065428854227105,0.7982489786476271,0.10681412190422745,0.1505922516451886,0.14659254175747916,0.09355284049803259,0.054192052555003256,1.3091200327426264,0.19271164556474152,0.9014240167041232,0.12062002881701861,0.17005655627960076,0.16553987708667717,0.10283386306801412,0.01195966245765917,0.8829894226533698
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Discussion,1.2189592038733648,0.20722306465847207,0.7862286864983203,0.245569489613655,0.1439895559575412,0.34429248763569054,0.08549861619383015,0.032168236868231115,1.4947163644910697,0.25410178196348165,0.9640920550967399,0.30112306759643004,0.17656337055550758,0.4221795231580766,0.09797120542737475,0.06824600154688246,0.6496693417468106
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Documentation,1.0061492428638066,0.08075604725661034,0.8079978306137902,0.1056854559747149,0.26112391695474735,0.022411072862880077,0.1424492861020777,0.007583946046844148,1.2070155325412242,0.09687807656438907,0.9693054372674916,0.12678435910968522,0.3132543466268333,0.02688518948681551,0.11857508346556744,0.023213527962043345,0.8548568664411569
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Factual,1.1204266682269346,0.4395871628968322,0.8433956158281221,0.21965955236928647,0.20341381714636775,0.32704770671110184,0.09815487399889178,0.046862954592680806,1.1221166028698106,0.4402501902918501,0.8446677057464148,0.21999086391132172,0.20372062531702592,0.32754099133660874,0.08857044904798239,0.056085334382016405,0.6913658524887039
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,General Character,1.4000254356153876,0.4138112547315931,0.964924719788367,0.12549354303765725,0.3132471204120407,0.3017241466879253,0.12337847830398879,0.09541875507219202,1.339505604104636,0.39592316014665396,0.9232132765698498,0.1200687501109573,0.2997061786074857,0.2886813161394559,0.10389464366817308,0.040952532117362384,0.8223580229085273
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,General Explanation,1.257559578662688,0.776070693132709,1.001959070600492,0.3550201045876026,0.36568776116200946,0.3868925723127232,0.11470974882681939,0.10829585185502272,1.1103983513723734,0.6852539098937749,0.8847085410621365,0.31347519873160656,0.3228945125153913,0.34161790959527694,0.10796246588778613,0.0936568782258616,0.6046277505473042
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,GeneralExcerpt Language Translation,1.4817972822085592,1.305746559690807,0.9953069957784282,0.624881860924778,0.24696621370142652,0.17059080108373603,0.10101247056702417,0.026953774968638045,1.5482508188615,1.364304823972771,1.03994310809894,0.6529056737278561,0.2580418031435833,0.17824121466498932,0.11950519396014203,0.1325966265815931,0.4484751134111508
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Human Decision Making,1.1907707722837935,0.4256199220191166,0.7730656200953823,0.09037087014364398,0.2209363557020474,0.2832175525249522,0.12881642022049483,0.09321798701359604,1.3266305248262396,0.4741805842629083,0.8612677378243856,0.10068164056271733,0.24614385937275074,0.31553096455800034,0.0989768050833793,0.042432224449616335,0.8321807936299938
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Hypothetical Scenarios,1.3418709235972626,0.324166963280073,0.8420130000644694,0.1845368794752733,0.5531027387575393,0.3065174498073232,0.11080462351780918,0.036944471923861244,1.5269962181123946,0.3688892264232535,0.9581776041864741,0.2099956948955981,0.6294091148843292,0.34880477578759655,0.12085304403910668,0.05971679765845711,0.738122724733404
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Idea Development,1.308349545490745,0.40060586335629855,0.9759856458975883,0.2839511853791351,0.3044858511342009,0.4192711761656095,0.12137193626276027,0.06705883586948555,1.2331356965306663,0.3775760018006846,0.9198785931525884,0.26762751893789316,0.2869816964557976,0.3951682909495595,0.10962341057193209,0.07717357513524517,0.6625089093055815
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Interpretative Analysis,1.2590832776675298,0.2543307588914947,1.156857888182976,0.45574075312784923,0.5391989853383587,0.8557712613911738,0.16786286727341976,0.11696202800396815,1.073210906766675,0.21678513979968228,0.9860765568080823,0.38846195131821804,0.4595996486067654,0.7294378916094215,0.13530009934987725,0.14692738844794104,0.5609328328148987
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Literary and Cultural Translation,2.3815415445675865,0.2377922759431086,1.533010498760183,0.05941015286470752,0.264361777340036,0.1005006468610175,0.08974679924498352,0.003362767988366211,1.3324291626460603,0.13304045182050594,0.8576914813222489,0.03323889957526216,0.14790560442680198,0.056228283335165094,0.08314630151536528,0.01153790771193669,0.9412117776671238
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Logical Deduction,1.3190154385246369,0.11732482337281125,0.9133164701614137,0.08208222972378743,0.26182227899863314,0.3335631071432634,0.10979832729719186,0.01801618814554773,1.4819395113034837,0.13181672202968997,1.0261288260358354,0.09222098229542275,0.29416242513552016,0.37476464152813355,0.13115227178595235,0.050472415812450766,0.8644035597759414
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,OptionBased,0.9838016574594616,0.42460412268255543,0.7000550427800505,0.1966982548657692,0.08024814758832985,0.06704275621303507,0.09724309753299065,0.026868213746845115,1.3223568837106932,0.5707229503263498,0.9409646728865455,0.26438793771431324,0.10786390687986036,0.09011415005152892,0.0843338061124711,0.032634652276852805,0.7221222222877376
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Paraphrasing,1.5607756806261877,0.0,1.0603110991562816,0.0,0.14018030844543697,0.0,0.08810656029071506,0.0,1.321460059847993,0.0,0.8977323172964883,0.0,0.11868629239116371,0.0,0.08526479469128767,0.0,0.9999999999979654
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Personal Opinion and Advice,1.2748281673842274,0.0,0.9759856458975883,0.0,0.3306426840506502,0.0,0.12215236622096459,0.0,1.1651560951729207,0.0,0.8920226687900876,0.0,0.30219785575998864,0.0,0.09549661287340516,0.0,0.9999999999979747
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,PostQuality Assessment Rewriting,1.592265460059583,1.3403472245924206,1.1905468026384056,0.4365221294820002,0.16760689053258768,0.34759383547951417,0.11610978589107901,0.06926901764552507,1.4817999516181681,1.2473588747433317,1.107951053891943,0.4062378331829686,0.15597894227559664,0.3234790564238226,0.1183280228265775,0.08761146729813685,0.612356058584306
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Problem Solving,1.3550763149725573,0.22703115172141408,0.9770296618823873,0.0723192694335053,0.184875479254127,0.03593390153085024,0.11000506740699734,0.06771016158162424,1.0724322683453849,0.1796766206710596,0.7732392079736652,0.05723479726322711,0.1463138477052924,0.028438749244916045,0.07192888554803023,0.012328012704601043,0.9017771700146744
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Professional Content Generation,1.3632027096650463,0.5232636332460561,0.9868067304967882,0.21020697520962028,0.22893577547746635,0.5362547544487948,0.11188908140364989,0.03983992082285162,1.210925216344603,0.46481211033495795,0.8765748080934679,0.1867256609219905,0.20336234771510053,0.47635204943678894,0.10039620976764346,0.040167501870228856,0.7572873751527838
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Quality and Compliance Assessment,0.7669284991036587,0.26258412850105406,0.6423519971337304,0.15885972957886674,0.09954833498299147,0.07542310073966445,0.11908199005841313,0.021556303714558134,1.1760751117894443,0.4026694256921475,0.9850386283990692,0.2436093774604331,0.15265610722564973,0.11566036695412749,0.0973595796096971,0.029131626511361086,0.7627715166265069
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Quality and Optimization,1.013767737888015,0.6132888494487869,0.7502806766433512,0.24600684766134107,0.4362858150530085,0.3703858330936047,0.13539157209096953,0.14412098082006786,1.641809214530834,0.9932287708146201,1.2150887055882564,0.3984110898710498,0.7065701980370674,0.5998443735954966,0.20774733688002378,0.10096569565686186,0.6759848079203364
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Specialized Summaries,1.345172271441086,0.426889671189818,0.9273542526597238,0.2783995598383461,0.31216783361694456,0.20804840161942798,0.08797383644018242,0.005167733289343501,3.0746794775796253,0.9757478198624407,2.1196668632222258,0.6363418510590768,0.7135264768387304,0.4755392037015496,0.24941551911430798,0.11239293681579979,0.6974325777191769
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Specific Character,1.660324015609179,0.2478550381209177,1.0591401082544125,0.11956804690771716,0.11326162602656682,0.507264793695218,0.07332227388468171,0.034846133559583437,2.728849372189344,0.4073657062185383,1.7407649303983412,0.19651778006614906,0.18615277148920883,0.8337223341921405,0.18887753632228688,0.08974944536637797,0.8332354573621092
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Standard Summaries,1.3540605156359962,0.3635133157071835,0.8407996841902436,0.16753546714173584,0.13510131176263127,0.24971204628306837,0.07724079142133522,0.03049076787799415,1.304929147144124,0.35032342761911406,0.810291713065779,0.16145653146895866,0.13019923223568527,0.2406513769694592,0.08188686060782524,0.0333094095696056,0.7653014976675662
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Supportive Conversation,1.308349545490745,0.0,0.8539204256208249,0.0,0.16100419484494027,0.0,0.09057921611461561,0.0,1.3073650838312334,0.0,0.8532778970838641,0.0,0.16088304797146777,0.0,0.08595130586950106,0.0,0.9999999999978706
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Technical and Practical Support,1.486114429388944,0.3329282325579128,1.0751107311570127,0.2002253358954953,0.14589417971359336,0.36308477536207157,0.10978725778079201,0.018549184581071854,1.3545366933620573,0.30345140208435684,0.9799224783641131,0.18249776669129547,0.13297698741225256,0.3309379421282299,0.10669209453920159,0.034169614641957335,0.7901624215947761
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Technical and Scientific Translation,1.260099077004091,0.14373560612340075,0.9686140465454607,0.005466975596075563,0.5770692543545286,0.4439995412650196,0.15711639623010676,0.07225529052802426,1.4169446014068017,0.16162649020480635,1.0891781996717795,0.006147454353588655,0.648897518867751,0.49926451380181713,0.1538093425414218,0.046798876887632124,0.9889785373617708
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Textual ExpansionReduction,1.3276497328854067,0.44409477245282214,0.9184801167889327,0.24694505121524823,0.11656297387039052,0.024633133911607583,0.09768996893260934,0.027291524093547348,1.2026731550285266,0.40229048964348446,0.8320201876505476,0.2236992005551497,0.1055904701909131,0.022314325944484903,0.08027683824175702,0.07388876085016338,0.6595747210686813
rb_CIR-AMS_BTRM_Qwen2_7b_0613/BTRM_Qwen2_7b_0613,core_eval_runs,BTRM_Qwen2_7b_0613,Tone Adjustment,1.6535943450044615,0.3282301606263174,1.1662236109722852,0.02322362415510315,0.16283739521014046,0.01833993958431865,0.10414638834164675,0.0330229874561197,1.5351089060929217,0.30471139693236626,1.082659877924638,0.02155957558754107,0.15116956367638343,0.017025822976564242,0.11611576310965577,0.0034467055780305067,0.9764457442235888
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Analytical Reasoning,1.0181476483353928,0.6431812232714873,1.029353689912785,0.3738726097347833,0.40103989548727215,0.3772638824987514,0.16677412642099654,0.22653109317020095,1.129049184169536,0.7132396137191895,1.141475841660137,0.4145966114978835,0.4447230886007839,0.4183572779922908,0.14375764727488927,0.09858194845732074,0.6219525287028395
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Casual Conversation,0.5939194615289791,0.5220037008940379,0.46101125209402266,0.2934334007482145,0.12652044664453244,0.08110559037432445,0.13910857517281067,0.06549276494604744,1.1183829478433522,0.9829616229245727,0.8681094937999614,0.5525512009316021,0.23824494606810526,0.15272627877169923,0.09575794390663567,0.0631858557992851,0.5231115262007732
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Content Categorization,0.7300571718542616,0.21628157433153172,0.5560203187278016,0.07638007068921526,0.19448244332183195,0.1545173698040489,0.1293757837699272,0.06374400204182773,1.2467859493355244,0.36936398733242115,0.9495671676974793,0.13044128955365986,0.3321356012574143,0.26388356011990177,0.10559716033264688,0.022615351505705705,0.8458794652601255
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,ContextBased,0.627900459867629,0.34787779902979343,0.6352855685212417,0.23780168596283946,0.16413463348479884,0.17409918724290915,0.1453168563063011,0.08697375451218303,0.7187717995437928,0.39822355231702133,0.7272256997653388,0.272216946281727,0.18788861199783286,0.19929526113116486,0.0676885303018866,0.040015887813384554,0.6322985884283111
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Creative Writing,0.7997289042970277,0.7608324156326612,0.8602345533304865,0.5201841383519701,0.34729007736041423,0.6781239479782118,0.16369845354150914,0.19005113437341195,0.7207151404801311,0.6856616515524498,0.7752427899231911,0.4687895889888113,0.31297757971144125,0.611124837184248,0.07268463827376509,0.0813070212680014,0.45610679144740585
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Critical Thinking,1.2756231687661368,0.7458722276848281,0.984807800908793,0.4286212578704598,0.17888110446194874,0.36112825121215997,0.10292624976758868,0.08684142782887483,1.3286688846950256,0.7768886965588507,1.0257602045109409,0.4464451121581424,0.18631972464749513,0.37614546561900225,0.09979633988266823,0.0672457155955205,0.5892739441923945
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Data Management,0.846960354818044,0.4599189209105301,0.713244770160506,0.3184086034055694,0.34322945491743095,0.37170724126098487,0.19332797580330985,0.24336492194770912,0.9362545988811724,0.5084077458471569,0.7884415042474806,0.3519781269169797,0.379415817765118,0.41089599068990135,0.08824413428160327,0.058828140302364496,0.5858171431723384
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Development and Implementation,1.2027991110057914,0.7337972188412197,0.9863988050238799,0.437469734115002,0.5822718866267376,0.3661238854018828,0.1375248480989878,0.038803486858264624,0.9813945176665662,0.5987238941734461,0.8048279805210354,0.35694273031720414,0.4750905052207817,0.2987298299711329,0.087635463400354,0.042521961275721776,0.629129328034497
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Discussion,0.981922621804568,0.35188499223010605,0.8422216633404709,0.226916961914287,0.34050456354121844,0.07449038226614191,0.13625556023241225,0.15352705323834642,0.9472559359361812,0.3394617256543524,0.8124871066796655,0.21890567990264098,0.32848308193063896,0.07186050632185353,0.079700055996972,0.021928857669721774,0.7616618205238468
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Documentation,0.5973389330599125,0.3631051331909807,0.7611708008362432,0.3324070649853277,0.9024199087103686,0.5202939651285713,0.6053073874611998,0.18959066729192775,0.959878652106458,0.5834825867366269,1.2231441178407247,0.5341531044265245,1.4501207901679853,0.8360731944734068,0.31481686599024844,0.1969254502209515,0.5979001494340407
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Factual,0.7975917345901944,0.23508866775166504,0.7159518517891617,0.263204322561561,0.18016340628604874,0.34130600218128093,0.15957067116977197,0.2106557661149674,0.7996186513235874,0.23568609765700588,0.7177712974099726,0.2638732026697226,0.18062125484077818,0.34217336178021673,0.06857572429525327,0.04528341308965822,0.6235618070895699
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,General Character,0.8870322868211687,0.5587630198515708,0.7615655835737556,0.21980938849343656,0.12075008843608248,0.1831554438756154,0.11376417041782999,0.1248977760864802,1.1735375800493013,0.7392396104177564,1.007545999479678,0.2908063006016708,0.15975153179806173,0.2423133853998919,0.10832410975495954,0.05704511585478586,0.6887454191465051
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,General Explanation,0.8905051875947729,0.5450049898638316,0.8141904193132664,0.2675662265673826,0.4858855328485549,0.4706314840660321,0.224573905354426,0.15697559344781004,1.0087704135136766,0.617385408474606,0.9223205180734634,0.30310086541516856,0.5504144801399997,0.5331345885543421,0.11408315480832454,0.09855134915629654,0.6211848543206611
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,GeneralExcerpt Language Translation,0.5847296317895959,0.3743252741518556,0.6163834897808049,0.34639280531683103,0.35754849195321414,0.055032119950957914,0.21042976163796945,0.05061389618821105,0.6730148497184407,0.43084265690126067,0.7094479554151059,0.3986928131217793,0.4115328375654062,0.0633411271207962,0.07399625225270867,0.06254076900907482,0.5091168421078262
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Human Decision Making,1.327903682719547,0.5777838302423876,1.0775134735252072,0.266055366316302,0.47594769371178003,0.4533738386833531,0.1392132152880753,0.3236999057720368,1.1905863700746846,0.5180357296150497,0.9660887847856687,0.23854282275404803,0.4267303753847663,0.4064908621831427,0.11391973270937461,0.048891994867533456,0.730899721530203
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Hypothetical Scenarios,1.3122221999956574,0.47418452870364247,1.0648329332646629,0.2970250331721982,0.19234527361499865,0.3715469535329723,0.1771559302731115,0.07312543678511058,1.7622864949728914,0.6368197330164662,1.4300479733545854,0.3988982998687054,0.25831561001231307,0.4989796533404514,0.26558399978468816,0.10283307378542839,0.7230541320788783
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Idea Development,0.7949202724566526,0.6054601779458791,0.783133187865215,0.39962995713901395,0.3744321326371973,0.676334068348739,0.19563150143143732,0.4078513010734955,0.8128198217635496,0.6190935757645208,0.8007673225020633,0.4086285905494016,0.3828633761875894,0.6915633629381565,0.08221124439816807,0.13287016920096,0.46039152780100595
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Interpretative Analysis,0.6850697495254201,0.41947298420870954,0.6979254189702744,0.3712501244070232,0.5340252804949754,0.3868277169368306,0.3237359898669316,0.1697841753822074,1.2065179294097244,0.738759340492571,1.229158829391845,0.6538311342793333,0.9405043444431598,0.6812657782659808,0.22345119676292524,0.24854173295690532,0.4869048662738121
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Literary and Cultural Translation,1.9832934879413193,0.20970977748301944,1.5582816722424,0.18299812443886232,0.1630660486313822,0.05246751630275798,0.11413221816261188,0.015423255261523161,1.1461773023151078,0.1211946635664547,0.9005561174870929,0.10575756834445149,0.0942385001795719,0.030321824107581774,0.08899864118201306,0.0023449612495510808,0.9578734837982925
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Logical Deduction,1.5473108677473224,0.5932783106169291,1.0682049121354442,0.48466259684964474,0.17909482143263208,0.21275524431525683,0.12383003042572915,0.0546323702395205,1.7900707839481484,0.6863586320773563,1.2357971784088935,0.5607020365313338,0.20719327582162272,0.2461347327928704,0.1335318127749926,0.041785387801578144,0.6904644110634666
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,OptionBased,1.378901894848857,0.43555518625263034,1.1746834561958954,0.0758695245925829,0.1846514626703987,0.10621733442961592,0.11938720518762302,0.02878555512042502,1.3992972283417981,0.44199748161199404,1.1920581954193716,0.07699171048687825,0.18738264186102196,0.10778839468162953,0.11715077638427107,0.02050961706032628,0.9099533954148636
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Paraphrasing,0.7943859800299444,0.0,0.7009204248511006,0.0,0.03526330016274975,0.0,0.09063961477855431,0.0,3.9921690333988926,0.0,3.5224599694746392,0.0,0.17721492884337295,0.0,0.2042086435876912,0.0,0.9999999999994633
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Personal Opinion and Advice,1.1861291872924917,0.0,0.8570050524401607,0.0,0.27313028853329807,0.0,0.10801643461751986,0.0,1.3388496835104091,0.0,0.9673490506084217,0.0,0.3082972784732077,0.0,0.10667617423058029,0.0,0.9999999999981378
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,PostQuality Assessment Rewriting,1.4755019655977228,0.765106755046328,1.1749209194966548,0.4972718981199613,0.4782985803892966,0.08463192039059941,0.1438802998068932,0.12529580885962532,1.3584259636599805,0.704398167714764,1.0816949888948197,0.4578150847335465,0.44034723445408985,0.07791666883101855,0.14718634921222706,0.11350884540439143,0.5789542585983446
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Problem Solving,1.045930854524226,0.36289141622029764,0.889804670857533,0.0586415621224986,0.22312051739339844,0.3915294902918639,0.13585719437123234,0.020433140837107677,0.9423230878899392,0.3269441363377843,0.801662443969847,0.05283264917459485,0.2010186562642208,0.35274538149047174,0.0829094816901268,0.020408528333471065,0.9012516517952508
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Professional Content Generation,0.7943859800299444,0.5758603775062376,0.7757747938329376,0.3290499275708437,0.24630880871253993,0.4727953683942009,0.19549493042904192,0.1492449991407011,0.8356940509915014,0.6058051171365619,0.8161150831122502,0.34616052380452755,0.259116866765592,0.4973807275506992,0.06981836029186994,0.06393841750536011,0.560023168545295
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Quality and Compliance Assessment,0.8745298440361938,0.7159518517891615,0.6690053572290563,0.36339008915189186,0.18379659478766538,0.053215525700149646,0.11869731246739113,0.14893161061258486,1.9884842851428155,1.6279135765465376,1.5211678007311973,0.8262673785038692,0.4179121420388126,0.12100014345077248,0.21334773893154158,0.06234334233448996,0.6838620224542501
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Quality and Optimization,1.0429388169346594,0.6972783314757056,0.8832684935041345,0.21283093574237377,0.2966391553084646,0.2633794517458711,0.13530599939350252,0.10425396374526469,1.0563269917348475,0.7062292728296938,0.8946069850767164,0.2155630401806583,0.30044710338687874,0.26676044599164606,0.08852878167874267,0.05998777135375977,0.7008237071065702
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Specialized Summaries,1.027070331861422,0.13384025289043655,0.7916974502216607,0.11678964582310691,0.07355870984706928,0.0013223737561031124,0.1033568221742468,0.010688062331626003,0.9533629315748962,0.12423524650653461,0.7348815155586943,0.10840827124050767,0.06827979066980899,0.0012274739924298256,0.06221171020607125,0.017378131383408768,0.8244219118920032
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Specific Character,1.2944836914289408,0.07565580762189938,1.0299258280530519,0.1217118148041576,0.2464156671978816,0.240217875048065,0.10171167179433915,0.03624873057303979,0.9754991714779697,0.05701282923942563,0.7761332171288041,0.09171979167190092,0.18569432800298813,0.18102378549467357,0.0682445198315933,0.012123927659302758,0.8676781063683892
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Standard Summaries,0.7659616229290612,0.6680792503560953,0.7048860619737802,0.28193127211768787,0.12395584299633246,0.16851583138380716,0.11128701667070895,0.11337842497993422,0.8222363544095637,0.7171626238516451,0.7566736093840987,0.30264459006919153,0.13306280288994055,0.18089658634261746,0.06461088523717962,0.031107356680223575,0.6460091031598395
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Supportive Conversation,1.5438913962163892,0.0,1.0121160804961076,0.0,0.15323506797994893,0.0,0.08080360373456796,0.0,1.4716162024943897,0.0,0.9647352310618289,0.0,0.14606157491534846,0.0,0.09235423140920906,0.0,0.9999999999981081
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Technical and Practical Support,0.9091519932868936,0.39046090543844714,0.792593132109212,0.272693949718153,0.14404523824056564,0.1791482506753029,0.13991232426411881,0.10001110376366573,1.172093010953201,0.5033883241681942,1.0218234987486408,0.35156131752879505,0.1857053806728861,0.23096073493923855,0.1141274303165778,0.06341699364469738,0.6537875584969063
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Technical and Scientific Translation,1.3853134039693569,0.674918193417962,1.4950689415802856,0.21884617797973172,0.3893923205850306,0.11326999446216585,0.16467164262516293,0.04704996559699065,1.237839496865031,0.6030695975161631,1.335910979509451,0.19554884985957877,0.34793945746499333,0.10121180705622745,0.13444736894973525,0.041559163441323654,0.8198165845754672
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Textual ExpansionReduction,1.4104251480246484,0.4434092849252429,1.0404565833689101,0.5686522532189718,0.16898333725717696,0.19654614781999288,0.11180247356549938,0.04210676687131018,1.394518097483017,0.43840842832834137,1.028722110624148,0.5622388819420662,0.1670775101452539,0.1943294619423238,0.12465099618420222,0.10568024772557236,0.5144658494826672
rb_HFXM_RAMO-Llama3.1-8B/RAMO-Llama3.1-8B,core_eval_runs,RAMO-Llama3.1-8B,Tone Adjustment,1.7567534990169875,0.06924429850139968,1.4097305678699275,0.31798116946420274,0.37037151019421405,0.14340408732851567,0.11918011717939259,0.03458137700534536,1.158444221248979,0.04566130507112742,0.9296092085661285,0.2096841978330597,0.24423161014896355,0.09456399908248961,0.09549532057969279,0.0343047111652115,0.7579652575415208
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Analytical Reasoning,1.3918968375761729,0.31612534094044875,1.1320853478649204,0.1384086497591983,0.6701392061896495,0.2236173202024463,0.14181607697000048,0.021697261265841372,1.377493677874262,0.3128541187862872,1.1203706822221084,0.13697641582289632,0.6632046965703378,0.22130335850065963,0.136266643312644,0.04674490709878748,0.8350442792838735
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Casual Conversation,1.4167307020240876,0.2551660815328032,1.100951838925571,0.1699927101478873,0.5966879877772371,0.38665051490129404,0.13843265056914866,0.006613199615096127,1.4026882170921307,0.2526369023105619,1.0900393206983352,0.168307760377699,0.5907736795283438,0.38281807587694194,0.12624715946488424,0.05386861789134478,0.7971162509422115
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Content Categorization,1.377791802784004,0.04096462232193798,1.1372726161269933,0.09982313186691982,0.6477812255084453,0.062422281633429,0.13837735498649528,0.010277532311777415,1.2736604942959582,0.03786858145750349,1.0513193644197087,0.09227865866155383,0.5988229529378841,0.05770450507810043,0.12294507253311704,0.028762431643511888,0.8801583871645368
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,ContextBased,1.3700640443606347,0.20737402215720913,1.120837582975243,0.15454292451357188,0.6386279582496973,0.4523364614999324,0.1388073226993507,0.029668141278544558,1.3505346226756694,0.20441803280628368,1.1048607314635346,0.152340010019409,0.6295247088448221,0.4458886829917097,0.13093799935253475,0.05782967940086167,0.8104592686393652
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Creative Writing,1.3087671644393466,0.24109856013103892,1.0804888054791157,0.10596751260362391,0.5987137108590912,0.2775615756044121,0.14595708032689464,0.03162684113427047,1.3070225292501363,0.24077716680493655,1.0790484738121808,0.1058262539776249,0.5979156032685214,0.27719157572580144,0.12676278668617424,0.024785026267884203,0.877546428814037
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Critical Thinking,1.413654604010902,0.1068381358725996,1.1036449877450423,0.08752707612315547,0.7060020074165474,0.3769345467864755,0.14636783604510373,0.04031247704041996,1.3528587279176079,0.10224343639500444,1.0561814391982043,0.08376287144423755,0.6756395621327189,0.36072403402282455,0.12508979940925635,0.040820851204767306,0.8802025202853416
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Data Management,1.3410286801873932,0.3035208417644679,1.0806375564924569,0.12868239151298377,0.6645121976289436,0.25148976927314204,0.14300434731620154,0.035545974353002774,1.3280290631069247,0.3005785820073328,1.0701621098113225,0.12743497463116604,0.6580705724481388,0.24905188651305865,0.12407974681263062,0.03685193661673572,0.8436650680897092
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Development and Implementation,1.4232580319545063,0.2680706878320216,1.1692256884428232,0.08382471289793547,0.7984350013737405,0.1273204470335687,0.15571967831420586,0.005673401175862125,1.3299817792496245,0.2505021031765371,1.092597987537132,0.07833108143573009,0.7461078595031367,0.11897622980424949,0.15003951211804945,0.040425950087783324,0.8929269370075773
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Discussion,1.3978239532601164,0.13801176329890952,1.1094262180218561,0.08846595292189519,0.7601713431609414,0.1405626738464295,0.13716486336401068,0.01448313825179548,1.412991211537465,0.13950927666916235,1.1214641817075939,0.08942586346970693,0.7684196744832588,0.14208786618085134,0.14643187514117656,0.03496345906984605,0.8873164213264727
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Documentation,1.2878346925935213,0.2526151709852831,1.0905184272192625,0.09990232680221856,0.5656269005221415,0.02768488211867215,0.13165286216708383,0.01937449080928688,1.4810840330581754,0.2905219888905841,1.254158969003604,0.11489342688396387,0.6505034969545928,0.03183920816769392,0.14739076297059833,0.022979698078258226,0.8910027352246096
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Factual,1.392647105384267,0.2440527396254093,1.0806940870877193,0.11943429873960154,0.6635368494784214,0.25404067982066203,0.1306454932420138,0.0487988116805359,1.4010628331920951,0.24552754355779194,1.087224691463585,0.12015603689223608,0.6675465842447413,0.2555758405984503,0.1302248150598111,0.02792181568584612,0.8642843476922264
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,General Character,1.2851712418747874,0.34589221622658206,1.0923253221904226,0.12116564591064116,0.7507179687789558,0.08613074436920265,0.1430970909271002,0.01558616942604732,1.3586182342961357,0.36565980995731096,1.1547512519393108,0.12809021128135734,0.7936211828930869,0.09105307994815759,0.14954386796200153,0.03275281546791728,0.8630657196593068
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,General Explanation,1.2090190593532362,0.27706452318154984,1.0571940320764164,0.10165711984226111,0.6551338500277675,0.2670203129006899,0.14345916890134525,0.027493640905182914,1.2250993398991878,0.2807495562896902,1.0712549986887943,0.10300918703583473,0.6638473074559997,0.27057175529499566,0.1230544115851625,0.023015253667578783,0.8807214116780079
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,GeneralExcerpt Language Translation,1.3459804477208142,0.08354232043127796,1.0546524998764975,0.054605577770509006,0.6507072699600123,0.2541157066014714,0.13390194942291972,0.04501149700411297,1.4480182163597344,0.08987560111017623,1.1346049152025026,0.05874518568262754,0.7000369002501658,0.2733800278043713,0.13190450358770278,0.016668046737875003,0.9291783199769043
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Human Decision Making,1.3072291154327538,0.1634458419932996,1.08432142353213,0.03723412405391424,0.6071167103097451,0.16477756735266658,0.15187609190241658,0.04918624142911207,1.3929109574185679,0.17415883839276525,1.1553928644724247,0.03967462074726669,0.6469099473372772,0.1755778511912377,0.13541488502555166,0.038443432256488524,0.9405724393411844
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Hypothetical Scenarios,1.387695337850846,0.1549490590666338,1.0374995021139464,0.01893540482608591,0.690546490569809,0.29688097166283506,0.11931847167176823,0.04434413314140928,1.3712223661551006,0.1531096989458165,1.025183614421233,0.018710627543032876,0.6823491921546034,0.2933567745931948,0.12269962077289609,0.014938509582860415,0.9687551605056046
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Idea Development,1.3719397138808698,0.24564471413070899,1.058394460569367,0.1143202336907494,0.6374275297567467,0.3203643540561803,0.13972613928004196,0.037714145984060504,1.366165045169364,0.24461076429276396,1.053939543699447,0.11383904529019007,0.6347445162287497,0.31901589975244365,0.12047424943306001,0.04417717085556375,0.8463081987746333
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Interpretative Analysis,1.427984719145499,0.08436761502018131,1.0977999326026093,0.09930667147471617,0.7064521681014039,0.3650052886377793,0.12464389232256246,0.03756787685582513,1.5004187266383866,0.08864713172410355,1.1534854364302778,0.10434396640459309,0.742286698367417,0.38352004965563763,0.1461500522615084,0.04968197322923862,0.8668459232933557
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Literary and Cultural Translation,1.4561197619490278,0.035487667322851024,1.1829514211764556,0.01665594533968906,0.7355625590554549,0.16708464086255592,0.1390477816001372,0.005995755834661831,1.332910768752852,0.03248489249897446,1.0828564582401319,0.015246609164423663,0.6733232263423826,0.15294684058185293,0.135950763988254,0.0002801164683961521,0.9964116779309139
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Logical Deduction,1.445015798389235,0.09333331532690581,1.1356011861767392,0.08718945560951319,0.6539334215348169,0.06272238875666669,0.13412268355760837,0.03022634147688874,1.481433689989648,0.09568554051646516,1.164221081506316,0.08938684068075298,0.6704141247118262,0.06430314459145103,0.1417239592801347,0.03529864196823118,0.8892526274332332
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,OptionBased,1.405101550998629,0.3268166572057898,1.0845871433808298,0.12224780301995453,0.663836956601659,0.10653802874936213,0.14409830106910437,0.012950293210092778,1.494187715102635,0.3475374672675715,1.1533520722727302,0.12999855088116785,0.7059255074342222,0.11329274644626985,0.13506714141230924,0.04843049088724016,0.8423896771520818
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Paraphrasing,1.3365270733388284,0.0,1.156558666949501,0.0,0.4831724684125999,0.0,0.14086282921154358,0.0,1.1515601293191553,0.0,0.9964982188878679,0.0,0.4163044365563804,0.0,0.10937057462049848,0.0,0.9999999999981914
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Personal Opinion and Advice,1.4040511760672973,0.0,1.1408905742238025,0.0,0.7388637374110689,0.0,0.13855576144845838,0.0,1.1846475901309248,0.0,0.9626096914380089,0.0,0.6234054433904749,0.0,0.1112153674167965,0.0,0.9999999999981375
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,PostQuality Assessment Rewriting,1.4414145129103835,0.11299033189897134,1.14681508481119,0.14513097116682472,0.5957126396267148,0.4225508295185967,0.1398213500066986,0.009349335117630364,1.446563571228438,0.1133939588939219,1.15091176740931,0.1456494117872018,0.5978406597727357,0.4240602765541621,0.13391083297061723,0.04038374525429034,0.8367029826224112
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Problem Solving,1.406001872368342,0.08072881615092498,1.1582384332087339,0.061038454342944526,0.7460663083687723,0.06842442409818184,0.1345789151017872,0.0036412164943632197,1.406001872368342,0.08072881615092498,1.1582384332087339,0.061038454342944526,0.7460663083687723,0.06842442409818184,0.14437943527656605,0.01822104927986601,0.928345683793208
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Professional Content Generation,1.3259201622018981,0.344072816791954,1.1041529815734394,0.13214091770057323,0.7225078991946177,0.16460875709584533,0.14937586780541384,0.022778636910298855,1.3330376043608883,0.34591977443073896,1.1100799937760253,0.13285024045271254,0.7263862685931124,0.16549236482796348,0.13548644046781932,0.04143522409006617,0.8419522280920383
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Quality and Compliance Assessment,1.4291101208576402,0.13684884819636367,1.210267421622815,0.049751091985617446,0.6851445623515314,0.37198277925305445,0.12969987673231143,0.01500488284696938,1.253801111204273,0.12006159317705745,1.0618038567765302,0.04364812305559718,0.6010978447877688,0.3263516332082521,0.13721744702939198,0.01606261125593611,0.9409669665417044
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Quality and Optimization,1.2866342641005708,0.19730167683354605,1.1089166611355257,0.0948317668658496,0.6881456335839078,0.1586066146310925,0.1383262416534391,0.0360574751098795,1.2653803778868977,0.1940424542975141,1.0905984885251478,0.0932652427659979,0.6767781693380737,0.15598659506985246,0.1335565979344676,0.022651134748528867,0.8925237303968159
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Specialized Summaries,1.161864727614522,0.004951767533421192,1.0297842481540453,0.04915504589363162,0.7568701648053273,0.0006002142464751614,0.16660247225287605,0.004242888297566805,1.3801749340981178,0.005882186855900473,1.2232770072548715,0.05839110234986089,0.8990833485200704,0.0007129923461698384,0.1733074997722267,0.005994086955643629,0.9606838355195483
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Specific Character,1.5200425791986452,0.05972131752429033,1.1936260648238388,0.005339405900936178,0.8048873045233497,0.043965693554314234,0.13681751257922015,0.010436872506098593,1.3785113272466276,0.054160662215612776,1.0824874732942915,0.004842253510482841,0.7299442013179576,0.039872045299433445,0.138920243811139,0.0035041940909572022,0.9924298656013736
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Standard Summaries,1.190599984664526,0.3079849352226278,1.0823488444200158,0.16386792462183963,0.5792067478486447,0.322690184261272,0.14957935782510667,0.032866093926178674,1.3745336923249118,0.3555649887827692,1.2495590229017517,0.18918359346992908,0.6686873845074245,0.37254202600601466,0.15396244068611198,0.046518648328127854,0.8173236424376897
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Supportive Conversation,1.3340511895721179,0.0,1.0528883284888542,0.0,0.5109323773120816,0.0,0.1137200318214413,0.0,1.791622377390699,0.0,1.4140224190491184,0.0,0.6861789769996434,0.0,0.17382097477100622,0.0,0.9999999999987405
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Technical and Practical Support,1.362636393060503,0.18148978277796268,1.0845308732952228,0.1162081471647971,0.6584350283833815,0.24593778749324569,0.14356944741123323,0.024902951892098013,1.3274925415706462,0.17680896696726078,1.056559660915047,0.11321101463063776,0.6414532840449284,0.23959478863938877,0.11844972286226385,0.03869908999766436,0.8509727568372608
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Technical and Scientific Translation,1.2491208736958659,0.22500531564742055,1.097608458005752,0.07081694477510414,0.7757769135692987,0.0852304229994898,0.1585166564432297,0.02286592967335138,1.4305218668185775,0.2576812468369818,1.2570063742002175,0.08110118898448238,0.8884375099347763,0.09760783474718626,0.16999127979239853,0.025525838222522357,0.9136960263159598
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Textual ExpansionReduction,1.3894959805902718,0.10818861792716916,1.10470682510122,0.1272925725004206,0.6429044847558336,0.1567684585012621,0.13870206985517447,0.013418580897138405,1.411476707966841,0.10990007628985876,1.1221824132949116,0.12930624031398663,0.6530747251926492,0.15924841151710123,0.13827239106488542,0.052201451651704395,0.8381451145150388
rb_LxzGordon_URM-LLaMa-3.1-8B/URM-LLaMa-3.1-8B,core_eval_runs,URM-LLaMa-3.1-8B,Tone Adjustment,1.3208464761496619,0.23310820797483678,1.0905538565324226,0.09462752729586832,0.5376419112802315,0.27234721433815806,0.12310922544695829,0.00745795749965078,1.5856860279876421,0.27984813910579964,1.3092180236664857,0.11360105099684259,0.6454431171007919,0.32695485836950533,0.158692076985064,0.03415424210960122,0.8836646662996519
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Analytical Reasoning,1.105003043276489,1.5411293600126164,0.7982938192669318,0.7534372455457313,0.07530199866564859,0.13446785476008674,0.08219993565290207,0.04896266995511936,1.5110946742297897,2.107498601369489,1.0916689742210823,1.0303274873657002,0.1029756884697092,0.18388515798162353,0.12592360410540182,0.1187923490192127,0.3892300620155825
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Casual Conversation,1.4380530259462718,0.46400822642063133,0.9993471675296633,0.23251583810204157,0.20869411058765464,0.1353284490305513,0.10036643392833483,0.06282485767522547,1.2450417664572113,0.4017304031562221,0.8652177216806116,0.20130824425570304,0.1806837991417557,0.11716506150532408,0.09405087588893074,0.016679922905194378,0.8176867659634345
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Content Categorization,1.7039766555198195,1.1861140532677732,1.0016361092351351,0.8630804316903418,0.024096639573007547,0.10068952964435296,0.09848968233613209,0.044944599300917654,3.2572968384479477,2.2673582664600422,1.914712928324481,1.6498519227739155,0.046062783574011376,0.19247663136283327,0.1773534773549713,0.13716503856124151,0.44257312424303935
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,ContextBased,1.839950550253219,0.6115060163069705,1.207721543440444,0.32944325598766244,0.07185962158379036,0.08406930279600625,0.07869642833233409,0.01722319410564177,1.7704864871221124,0.5884196934077204,1.1621261628869837,0.3170057113325112,0.06914668927722001,0.08089541567387344,0.11332119921275613,0.021168364206246376,0.8011187669453252
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Creative Writing,0.7470227203341859,1.2809945715864905,0.634788378315042,0.807021530084499,0.06777179879908372,0.15108808160843348,0.10357652285735042,0.05847270826411585,1.0779821533936356,1.8485238121627834,0.9160237357963896,1.1645627142991506,0.09779727927969044,0.2180258392830559,0.09015085063233053,0.1372260939166065,0.25020320584294625
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Critical Thinking,1.4991552191492552,1.1874049446734698,1.0260913297541694,0.4763150233057294,0.17211885409291106,0.1368344890038643,0.07923316253692758,0.011325514369108114,1.1913438813771648,0.9436031689610466,0.8154109806791688,0.3785165014102564,0.13677886123733235,0.10873919468367921,0.07088598599726181,0.06419620273056725,0.5412966422751924
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Data Management,1.1479251825159087,1.0424217036711445,0.7773123516625331,0.6057791797814658,0.11661052364794723,0.15119565589224157,0.08899775507297392,0.023357009881078167,1.9071253032259534,1.7318452787541276,1.291401283763668,1.0064216897800191,0.1937329049477025,0.251191510981546,0.14182153721720037,0.14357842801425566,0.4144945727789505
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Development and Implementation,1.2776597687884403,1.517348719898295,0.8719612864536078,0.8971716747097568,0.07315051298948719,0.0815278603410406,0.08657240512199138,0.014647773771633787,1.6881101570547932,2.0047995940917316,1.1520803426539084,1.1853896113012699,0.09665024053189017,0.10771882506339156,0.12374200993669554,0.15378271195763674,0.3244026962323527
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Discussion,2.0142208900222913,0.2470981299071353,1.1919170882443084,0.107929876579546,0.09767744969772701,0.15490696868361994,0.06970850297087683,0.011447870777125002,1.7985139533522885,0.22063589782365955,1.0642723075038187,0.09637145060577534,0.08721697658854292,0.13831767212279938,0.11209276100881105,0.025715986350306808,0.8782269795223584
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Documentation,0.7022987118409811,0.41873289972291017,0.5643810095626606,0.2973965780237834,0.0970320039948786,0.08885635842546534,0.10862814091709505,0.0069109229177967135,2.121104490964813,1.2646701740186161,1.7045611415982092,0.8982064278463979,0.29305908720996576,0.26836674726766263,0.18472874967530833,0.12106029801147972,0.557569466533257
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Factual,1.853720058580652,0.6809452165050791,1.1862096748533804,0.3103279033898785,0.10241071818528208,0.1966457908011509,0.07601262113158314,0.032642729562891826,1.6223931835986445,0.5959696409110617,1.0381818343566018,0.27160189199532914,0.08963082119231137,0.1721062406927996,0.11219833949897506,0.023332201352813053,0.7931705900302908
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,General Character,1.7414125062850276,0.8738948221485123,1.1052361208914063,0.3993379660437708,0.0942350726158688,0.1191923064593409,0.08468637950333668,0.029867573987533397,2.674830086821931,1.3423127228983764,1.6976556780963052,0.6133878116350915,0.14474617964269904,0.18308078429692987,0.16985561934164883,0.07245230178733453,0.6762237043981357
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,General Explanation,2.5860857827459887,0.7026752218343093,1.5424957028257062,0.30139550261562076,0.08519883277599097,0.08605942704645551,0.06821770917495407,0.016289035459594536,2.6564151143213217,0.7217846724936305,1.5844443081122508,0.30959203823721215,0.08751583904086883,0.08839983741501903,0.1584472106231425,0.037975465271199715,0.8063157521064399
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,GeneralExcerpt Language Translation,0.8399937951153099,0.8338502950009583,0.639728898051759,0.3215360925655607,0.20359777889224734,0.14454622047435522,0.13172020156579212,0.09024596742883012,1.683881734592916,1.6715662535867668,1.2824235283461047,0.6445628008587854,0.40813942087903154,0.2897625457195132,0.16130160122880888,0.10112949592232143,0.5723869043910429
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Human Decision Making,1.7865937054844165,0.5675619213713745,1.1861260059659742,0.13927881578594747,0.16953707128151738,0.3298227541555408,0.08299944359189676,0.01964236407199263,1.6429953847737302,0.5219438613960863,1.0907905628346521,0.12808421457581276,0.15591044836243972,0.30331309307058385,0.11288559274807713,0.017363844990914173,0.8753089211470776
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Hypothetical Scenarios,1.746576071907815,0.5972524237024015,0.9905440053047031,0.2628159280413145,0.06712635309623531,0.03485406795381449,0.06327317398120003,0.0076008205171367305,2.8607711522628003,0.9782582802022093,1.6224427673094275,0.43047436489525626,0.10994833696797163,0.057088559579523745,0.18326667905405214,0.01857343810041001,0.8635798908774003
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Idea Development,1.8993315549152734,0.9024406668658942,1.1680774316826203,0.41349761791091755,0.07874437574750681,0.13446785476008674,0.07858170985967106,0.02928484723948821,1.9677476618173766,0.9349476175229936,1.210152924099476,0.428392277643994,0.08158083872056637,0.13931154153102182,0.12108935484804462,0.05479944049409574,0.6721638917745906
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Interpretative Analysis,2.2952049193289685,0.2486041698804482,1.3861006232160746,0.20432539939522698,0.15490696868361994,0.04690238774031827,0.0655991505678219,0.008636392616304611,3.203680953738343,0.3470053751447928,1.9347397389948329,0.2852004127800696,0.2162214366977509,0.06546704611126347,0.18846221832448445,0.03072331015363622,0.8565670404638773
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Literary and Cultural Translation,0.19492460226022176,0.20271029105083077,0.4506047694756009,0.22257306080798944,0.044535753496540736,0.3501744639734799,0.38124443676284436,0.2478265906548619,0.9638621894097716,1.0023608240447233,2.228148189757112,1.1005781476861942,0.2202201691035571,1.7315409220139744,0.5761699294796355,0.3650142345690364,0.5740202173280435
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Logical Deduction,1.447519562921382,1.319075868054547,0.8853004976458084,0.6712605427878028,0.07616259293611313,0.020223965355917042,0.09129557309535952,0.0669810876951491,2.1975660192535265,2.0025679643346272,1.3440276320191813,1.0190807756124753,0.11562698733884488,0.030703211327263905,0.14359170917147757,0.11670126122124314,0.45633363069057364
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,OptionBased,1.1754910927417264,0.8704911045748975,0.7696581425521338,0.3688273965596166,0.08907150699308147,0.0692778387723967,0.10204709172492055,0.021736799731003398,1.971730645128578,1.4601335542227416,1.290999613109117,0.6186591162195817,0.14940565779738385,0.11620440050907634,0.1435263131774268,0.10785486317408621,0.5569832059313401
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Paraphrasing,0.11445903797178585,0.0,0.159466923047262,0.0,0.024096639573007547,0.0,0.21272812693011667,0.0,1.2590494176896443,0.0,1.754136153519882,0.0,0.265063035303083,0.0,0.19432555486755465,0.0,0.9999999999989736
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Personal Opinion and Advice,2.160521916001266,0.0,1.2865167181553046,0.0,0.06282338174391253,0.0,0.07926034147054906,0.0,5.392703986197428,0.0,3.2111703116376433,0.0,0.1568083612795906,0.0,0.27001627745195456,0.0,0.9999999999994255
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,PostQuality Assessment Rewriting,0.3019072275073468,0.10267965389480224,0.20294486275302337,0.029579939872668853,0.06239308460868025,0.06368397601437709,0.11800707837868174,0.05447143787463682,1.0634597283545306,0.3616861965845001,0.714867578349133,0.10419450730428859,0.21977788791934003,0.22432501663491258,0.055416434615803256,0.027017248851365028,0.7989841227079971
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Problem Solving,2.4823841731550096,0.49010843702956386,1.4598069366719035,0.2939168487600474,0.17814301398616295,0.3003474003921298,0.0693728061583132,0.019142326249317998,1.586923813793017,0.3133136113555637,0.933216710127736,0.18789341779419833,0.11388220816611354,0.19200430265687743,0.0927929494106291,0.013724910315381589,0.8432127137351851
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Professional Content Generation,1.93741285138333,0.7889632442338572,1.192592415692659,0.3810867561737299,0.0654051645553062,0.09401992404825266,0.07109566970395198,0.019965990565108255,1.9842500127692715,0.8080365144309514,1.2214234639648764,0.39029956896590756,0.06698634135278829,0.09629286569463315,0.11697618155905809,0.055319731636559705,0.6829136400230119
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Quality and Compliance Assessment,0.3875901445604741,0.12129000499359821,0.30627295052522424,0.05011616946908426,0.02194515389684616,0.03141169087195626,0.12669917003614795,0.1452000274871011,1.2627644616233684,0.395161512761684,0.9978339307836612,0.1632779332663855,0.0714970719320475,0.10233894609881308,0.11219026173500213,0.015911315570242,0.8590320053528795
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Quality and Optimization,0.5443258760688312,0.8894241785251178,0.36575256494743597,0.635946856235982,0.08347764423506186,0.08003526715320364,0.09753794931852411,0.03864761697759378,1.5888841913182363,2.596224208264659,1.067629693771147,1.8563253205770986,0.24367077716659116,0.23362249769580387,0.10714236662645404,0.15903605060163223,0.20159569008210573
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Specialized Summaries,0.8423066422171834,0.32078651431566296,0.5601213667413161,0.14825529213482092,0.1142438894041697,0.0892866555606976,0.08519866651244778,0.007914614270991716,1.6300193354017716,0.6207813101108661,1.0839385708234728,0.28690144496460734,0.22108308227288398,0.17278621307579445,0.11053060440118095,0.047804155990463004,0.720080055912041
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Specific Character,1.3027245769157205,0.5575575129772237,0.8158433684003984,0.3455883630825234,0.08519883277599097,0.04066307927945023,0.07448365389610712,0.006667724593239277,2.4123104244718157,1.0324529256843455,1.510731813310838,0.63994064870863,0.15776629695967614,0.07529755082166362,0.14944680923931541,0.06968879977212,0.6415120138269982
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Standard Summaries,2.010348215805201,0.8593033790588587,1.2695199813136295,0.27576070019288546,0.08347764423506186,0.1265073577582896,0.08122334193865122,0.038092291496746034,2.612952593925914,1.1168806357170484,1.650060174717342,0.35842031306164834,0.10850017192243737,0.16442809559379684,0.17536217843970114,0.040494912135750016,0.7994280113855726
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Supportive Conversation,2.5069111098632493,0.0,1.4187852764464264,0.0,0.020654262491149326,0.0,0.06326747923059428,0.0,4.961594904937681,0.0,2.808012526300219,0.0,0.04087822784706637,0.0,0.20996805941035268,0.0,0.9999999999993373
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Technical and Practical Support,2.1678369673002145,0.6985873990496028,1.2713487441383668,0.2937614636834356,0.05077506195740876,0.1858883624203439,0.06835390142326003,0.013024330269413742,3.0947945776348695,0.9973003169492283,1.814971909323215,0.4193725813513529,0.07248625648291279,0.26537341356456207,0.1714719061550929,0.02716374613982353,0.8286453953061479
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Technical and Scientific Translation,0.12527025349449683,0.06664226881909899,0.171851412470666,0.10408857819523294,0.03981593179446169,0.008377347351553405,0.34097468599036745,0.07614834493657796,0.2925850211996629,0.1556517137253681,0.40138139548931856,0.24311245493913597,0.09299530353930893,0.019566387742313215,0.03474858742510123,0.031236793172627952,0.4849414213775415
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Textual ExpansionReduction,0.3358469140487927,0.28109160359048535,0.25065797960099934,0.18482149072545923,0.016996736841674964,0.06363018887247304,0.12427335824966468,0.0318400706237327,0.985381446460426,0.8247282894302028,0.7354354385111211,0.5422698862183573,0.04986875994258402,0.18669222472176233,0.07806104745369963,0.10634997356075179,0.38410752423737365
rb_PKU-Alignment_beaver-7b-v2.0-reward/beaver-7b-v2.0-reward,core_eval_runs,beaver-7b-v2.0-reward,Tone Adjustment,0.17814301398616295,0.10327131245574664,0.164265931374922,0.04098580213087444,0.03184198800718854,0.007745348434180996,0.1377111885128835,0.03023506212716437,1.2036171311349588,0.6977490615275121,1.1098571009921994,0.2769191587937315,0.21513929397098297,0.05233117961456343,0.11864972435918014,0.057950129968463215,0.7187174510178558
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Analytical Reasoning,1.0262413168380453,0.5854199031344237,0.8243775949158385,0.2751450880391756,0.2647194916225347,0.28647725805726354,0.14664976555382464,0.05984443099638151,0.7822987087371985,0.4462627130450936,0.6284189862883032,0.20974174743969942,0.20179436656471905,0.21838020491250415,0.05854208471400091,0.035749150052455325,0.6494599275627528
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Casual Conversation,0.4913628919842938,0.4505670799191771,0.4641656839408827,0.2681443252280013,0.11604142098522068,0.06980616731142181,0.16397576462150276,0.06406413892949478,0.601273012559728,0.5513518214800458,0.5679922185066064,0.3281239769237384,0.1419980546266516,0.08542070473634511,0.04328666425829136,0.053259554191578046,0.45556120275397344
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Content Categorization,0.7977847692733921,0.5874596937376797,0.5592551816926606,0.06849667210933164,0.1885673091009836,0.07252588811576291,0.1344484318833391,0.21974329452721675,1.1241512657943253,0.8277841139030941,0.7880413923851128,0.09651803797224001,0.2657084810059315,0.1021955696176659,0.09008867943808874,0.03858870295708383,0.8265553762934926
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,ContextBased,0.718006292346053,0.33089936452816837,0.5737603593158134,0.13341741501295568,0.10878883217364438,0.13326631941271438,0.14128586001218463,0.14591895382289044,1.0041291005741793,0.46276151731007,0.8024017055093329,0.186583753326389,0.15214077281426958,0.1863724466974802,0.08255531559139562,0.03482110468185523,0.7409747943855564
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Creative Writing,0.9573417231280705,0.6454804042302901,0.8891472422192213,0.3714968636179685,0.2212039587530769,0.4357502676205272,0.18905866832534812,0.18582024949936463,0.9677476114229411,0.6524964955806193,0.8988118861563867,0.37553487300512023,0.22360834960908865,0.44048668357292425,0.10563631852478084,0.132897140079194,0.5340352810475771
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Critical Thinking,0.6092174601724085,0.54757045527401,0.6559563658470113,0.26673409962575034,0.1885673091009836,0.23933543078201766,0.1747144303171233,0.0731136215050725,0.6295247088448221,0.5658228037831438,0.6778215780419117,0.27562523627994207,0.1948528860710164,0.24731327847475157,0.05759660652088017,0.05802875430017529,0.5602289044947822
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Data Management,0.8775632462007315,0.5729545161145271,0.7041054971238649,0.32505700131884296,0.15955695385467844,0.19219360350677175,0.11725220058319835,0.10399323143283573,0.9170042909737979,0.5987052808837194,0.7357506880058364,0.33966630474890325,0.16672805290432688,0.200831518271121,0.058280423440345064,0.05580664876741409,0.5363511460556354
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Development and Implementation,1.0733831441132913,0.3644425877817089,0.8188531620320207,0.37653023580100253,0.2633737964328867,0.254995073225763,0.14490397916785314,0.05214784087931151,0.9691711883741367,0.32905981226892145,0.7393528550386206,0.33997390222808954,0.2378035249345482,0.23023826999996075,0.07603389084766071,0.07086378175050806,0.540198505909548
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Discussion,0.6055911657666204,0.45781966873075347,0.5765808105203152,0.21505940434326914,0.1015362433620681,0.0797784769273392,0.13478008847331713,0.08721090247405261,0.7459599790237841,0.5639368104895375,0.7102253692501896,0.2649076106480005,0.12507113420758056,0.09827017687738474,0.07383402994825577,0.05550481807354998,0.6073840355537278
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Documentation,0.9138261902586129,0.17406213147783112,0.6273489322013492,0.10919175377428747,0.2574669028109584,0.05439441608682222,0.09813297824729073,0.0008879818969230735,1.3489815189531904,0.2569488607529886,0.9260865189638966,0.16118782700013878,0.3800701898637957,0.08029651898530893,0.09981877565342251,0.030372384962403964,0.8013666048844392
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Factual,0.8485528909544262,0.5802071049261033,0.7311012443669545,0.38096237340807687,0.0870310657389155,0.1287334514054792,0.12982912681075454,0.05171399918660352,1.160520865570024,0.7935185405607001,0.9998884665606877,0.5210220695139874,0.11902778108410503,0.17606192618690536,0.10217880248366984,0.10469402509168585,0.501161026502656
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,General Character,0.794158474867604,0.869404083787708,0.7238486555553783,0.598086750954642,0.27559837483989913,0.22256381915524748,0.14007699266041984,0.12794066780736382,1.1275837887433156,1.234421065530639,1.0277545796435141,0.8491918754012475,0.3913076161848949,0.3160066439914201,0.14713750154712713,0.23072424716710413,0.3388317982194835
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,General Explanation,0.8195425357081211,0.47867086156403515,0.7514487851994324,0.33543223253540344,0.05802071049261034,0.13054659860837325,0.127297888372356,0.074453373390024,1.0299656192007467,0.6015728395331794,0.9443883381560435,0.421556724672872,0.0729179199434157,0.16406531987268533,0.09997596996877878,0.08326506032137154,0.5592074085512168
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,GeneralExcerpt Language Translation,1.1096460881711727,0.5493836024769041,0.8145060157000819,0.16882415066947032,0.4170238566656368,0.3046087300862043,0.13873427465231486,0.14181227920205447,1.6917555114740832,0.8375848365631493,1.2417878600017644,0.25738763954525834,0.6357904699984299,0.46440347373798363,0.22684025921840067,0.12284751720800202,0.7409663712646641
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Human Decision Making,0.5729545161145271,0.34268482134697986,0.5431383176669357,0.22211053235452388,0.2502143139993821,0.1164947077859442,0.2561239536164207,0.16871238630025884,0.6660596249831378,0.398371104815864,0.6313982942878127,0.25820349386213415,0.2908741400242817,0.1354250978011601,0.05331191419118769,0.041760924296613045,0.58429529292591
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Hypothetical Scenarios,0.7796532972444514,0.20669878112992435,0.5920932921450757,0.04532868007235191,0.27559837483989913,0.08884421294180955,0.17324296587234733,0.09889145637720953,0.8951574894288146,0.23732082277880207,0.679810816907309,0.05204404008307062,0.31642776370506936,0.10200631856281844,0.056087751383323114,0.021228896999861024,0.880422508407815
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Idea Development,0.9573417231280705,0.46053938953509466,0.7754226204376985,0.21986534616969022,0.16499639546336065,0.2289098343653767,0.1314536100498097,0.07106646868097868,0.9783822005594568,0.47066113436004176,0.7924648758319337,0.22469755158001314,0.1686226898691488,0.2339408197360444,0.0829523582934848,0.04152822529526684,0.6962580499904499
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Interpretative Analysis,0.6374345635174476,0.33560221508567484,0.6126045198778147,0.23825887463029932,0.11150855297798548,0.09111064694542717,0.20457793258267226,0.0523593860503892,0.954227998504992,0.5023904386795617,0.9170578728150789,0.3566692207745328,0.16692628453847327,0.13639098858631352,0.1051626293456,0.06539950810995324,0.6196743060834898
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Literary and Cultural Translation,2.6888973018919105,0.8467397437515318,1.539865627257872,0.11442973458264838,0.18494101469519544,1.5656526096990322,0.08143197031172211,0.39717629377706787,1.5436262288638745,0.4860913343758797,0.8839969341665561,0.06569114392707587,0.10616984176946406,0.8988005722346295,0.10377002069439056,0.09368979804549055,0.8716909340018809
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Logical Deduction,0.8195425357081211,0.13779918741994956,0.7790489148434867,0.4456312903112988,0.2610931972167465,1.3018396916779444,0.14328204789873178,0.2770221547580557,0.6570470329384074,0.11047693474185616,0.6245823196590022,0.35727336205992066,0.20932471845825368,1.0437163045349038,0.051545328542196844,0.15082821472883456,0.38405833133413086
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,OptionBased,0.6817433482881715,0.5802071049261035,0.6527329930418663,0.27398668843732654,0.24658801959359394,0.05802071049261032,0.16181292437498263,0.2160227314823029,1.1321809176928561,0.9635582278237074,1.0840030063016708,0.45501360758341736,0.40951224682507564,0.09635582278237076,0.18702379550673476,0.11071161310043287,0.6118910115198697
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Paraphrasing,1.0733831441132913,0.0,0.7800562188450945,0.0,0.02901035524630517,0.0,0.14613873011677958,0.0,1.9197044692795404,0.0,1.3951005452421883,0.0,0.05188390457512271,0.0,0.2537852472476614,0.0,0.9999999999987871
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Personal Opinion and Advice,1.3997496406342245,0.0,0.9335693486901261,0.0,0.29372984686883985,0.0,0.09908829669555885,0.0,1.4149643106411183,0.0,0.943716841610671,0.0,0.29692256259567507,0.0,0.10729561883294836,0.0,0.9999999999980971
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,PostQuality Assessment Rewriting,0.942836545504918,0.38438720701354345,0.7486283339949306,0.24981139239873895,0.25384060840517025,0.2828509636514754,0.13657552133456052,0.03179090354150804,1.461396645532623,0.5958001708709924,1.1603739176921426,0.3872076582180455,0.3934529430280139,0.43841899365978687,0.12235183403628824,0.0570950644229955,0.6776499228914513
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Problem Solving,1.044372788866986,0.1323597458112673,0.8799807758045901,0.252657026203281,0.10878883217364438,0.09473694135121531,0.11553800992054064,0.03734313225638053,1.088253998483246,0.13792107966888367,0.9169547579812536,0.2632728676403936,0.11335979150867147,0.09871748510546807,0.08954017663144909,0.018676256183237316,0.7853507841670337
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Professional Content Generation,0.9555285759251765,0.5820202521289974,0.7669612668241929,0.2267944959620003,0.13779918741994956,0.13580986038630555,0.15207624726615376,0.0598708849503882,0.9453633783089512,0.5758285473191145,0.7588021044111696,0.22438178855814916,0.13633323861760968,0.13436507463751504,0.07545985552948525,0.055237948772616785,0.6562307286210896
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Quality and Compliance Assessment,0.8267951245196973,0.28647725805726365,0.6805345834862421,0.2252331747595081,0.7724007084328751,0.322740202115145,0.20447019739552375,0.05344866434494394,1.2014366653176851,0.41628726561446117,0.9889018166284456,0.3272919570724103,1.1223947794415217,0.46898185619857025,0.18345091232905097,0.11446412729612909,0.6488784646648738
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Quality and Optimization,0.47867086156403527,0.4569130951293064,0.5137250408199874,0.3600104501746343,0.18131472028940732,0.18494101469519544,0.24423497917982762,0.09778624199365132,0.718006292346053,0.6853696426939597,0.770587561229981,0.5400156752619515,0.271972080434111,0.27741152204279323,0.08984086452432449,0.11332362941235952,0.40616474732107216
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Specialized Summaries,1.1060197937653844,0.1849410146951953,0.8145060157000819,0.1609671794569294,0.19944619231834804,0.10516253776785622,0.15165187982044126,0.021606060280063066,1.0389882911129371,0.1737324683500321,0.7651420147485617,0.1512115928231762,0.1873585442990542,0.09878905063041038,0.09988220855823848,0.043484253805518014,0.7619708156291778
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Specific Character,1.515791061619445,0.6599855818534426,1.117301598583392,0.1923950643070933,0.06527329930418663,0.20669878112992435,0.12674846358426883,0.06924617211644568,1.8072893427001078,0.7869058860560278,1.33216729061866,0.22939411513538044,0.07782585686268406,0.24644854673183286,0.2019224846219575,0.09024391090590228,0.7798873418988969
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Standard Summaries,1.0226150224322572,0.8558054797660023,0.8118870252959016,0.31578980450405103,0.12329400979679697,0.24930774039793505,0.12505938756575374,0.1129601875680471,0.9704407865938769,0.8121419348799821,0.7704642178828455,0.2996780797844566,0.11700349909287877,0.23658795772457103,0.08681465277078915,0.055918104633606025,0.6156837905893673
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Supportive Conversation,0.13054659860837325,0.0,0.27620275724086374,0.0,0.11604142098522068,0.0,0.4068176136860061,0.0,1.1037121518707922,0.0,2.3351687657636666,0.0,0.9810774683295931,0.0,0.3123241941766598,0.0,0.9999999999992446
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Technical and Practical Support,0.64548040423029,0.5620756328971626,0.6017634105605106,0.2906072044638556,0.07977847692733922,0.223017105955971,0.1413045234072351,0.09465992750374558,0.9528520252923329,0.8297306961815258,0.8883174155893253,0.42899158754188205,0.11776822784511981,0.3292157278397667,0.10618841562391024,0.07665357789902905,0.5606273295902862
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Technical and Scientific Translation,1.6300193354017718,0.40070553183959,1.1865663399939497,0.3481620368557221,0.1541175122459962,0.13598604021705546,0.12865723954564356,0.016233628304742065,0.7278225869700935,0.17891967933302633,0.5298156680903218,0.15545839785185744,0.0688152612819332,0.060719348189941055,0.04524678222857326,0.024721484260542437,0.6826340665908025
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Textual ExpansionReduction,1.0226150224322572,0.30098243568041616,0.771091213230785,0.13565237076652292,0.4041051828450165,0.5553896525864908,0.15619725839761112,0.08922257927763369,1.5983730602722677,0.47044313476098654,1.205235005554,0.21202807531574175,0.6316265883123788,0.8680880284124983,0.17108910660536641,0.13248391485958905,0.7507305152266666
rb_RLHFlow_ArmoRM-Llama3-8B-v0.1/ArmoRM-Llama3-8B-v0.1,core_eval_runs,ArmoRM-Llama3-8B-v0.1,Tone Adjustment,1.1368432962145838,0.12510715699969133,0.9877623039766267,0.011080344017686095,0.5657019273029508,0.3118613188977805,0.21913359689049877,0.09283007070183066,0.8959866656606466,0.09860140339806156,0.7784906294053076,0.008732813505464398,0.4458498240608002,0.24578900557197958,0.09441211768973479,0.019362745133626236,0.9789529367092534
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Analytical Reasoning,1.4624933784748613,0.8367386891189836,1.5248015187557784,0.6262545025456998,0.6841814232084045,1.0200194718079314,0.13600205619648698,0.40718623482572713,1.172282082452968,0.6706996335069075,1.2222260463145047,0.501983081219427,0.5484152170389949,0.8176109158186743,0.2156940401534257,0.16863459771333322,0.5836098637557827
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Casual Conversation,0.34505960195174595,0.12852339166855853,0.40359772274371875,0.24148688471911034,0.1365432513086766,0.1374686197286902,0.21190885808189108,0.31047284722846213,0.5358880181826358,0.1996007219095039,0.6267994936550179,0.37503644975316375,0.21205580695665682,0.21349293215440526,0.053845125239735225,0.0779044915679308,0.42888537957376954
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Content Categorization,0.4869494263538346,0.16286484192239742,0.5384958746390377,0.22542431683591085,0.2681512043772805,0.06415887712094445,0.17851705201578827,0.21405014673912737,0.7591226798671946,0.25389576117179824,0.8394802608441273,0.35142193879362527,0.418030394656597,0.10001954227979931,0.07504650704685134,0.06815466596786679,0.5637894504610366
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,ContextBased,0.7871800692915873,0.40592828024597516,0.7333030635041276,0.4054141866793012,0.2743203271773713,0.4883888883405225,0.1766931372184346,0.132713572140778,0.8891014556736936,0.45848648733016484,0.8282486392195378,0.45790583068461,0.3098383860681053,0.551623813277174,0.09427791823400888,0.09805649534327193,0.4858012010053433
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Creative Writing,1.1211352502031697,0.6806598822766861,1.0158774485019908,0.539626880485721,0.3070166780178526,1.0413479286553284,0.22181220838521332,0.313875071808758,1.1862334260214185,0.7201820689895261,1.0748638809956548,0.5709600541913437,0.32484345287050215,1.1018132922546704,0.18766444106461966,0.18474123569581952,0.5015563662050933
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Critical Thinking,1.3226599283394695,0.7789545722247997,1.1578301065622285,0.792538066686481,0.3779615902188969,0.6539270168096258,0.1867771546268162,0.5067934210937246,1.1712539322228563,0.6897869862033861,1.0252923189859906,0.7018155666122186,0.33469600861716575,0.5790714403713749,0.2273404849881051,0.1967485891894214,0.46561005260881244
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Data Management,1.029832232761826,0.9798623380810902,1.0444553386583373,0.481425776587457,0.6884741044901344,0.7796743032181435,0.2686323891528913,0.2446123413683846,0.9607081124740583,0.9140922835341229,0.9743496902159688,0.44911164596327047,0.6422625320613313,0.7273412157438881,0.17080908001733058,0.1455656914026025,0.549757675265817
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Development and Implementation,1.0240743848150744,0.2623933564305292,0.8400060393427353,0.1365318270071949,0.19000898224279694,0.5872040980248936,0.14480022597661535,0.12258263883840978,1.287946312190738,0.33000391452919287,1.0564493132838082,0.1717118069722392,0.23896835190045013,0.738508221442351,0.14692575195733337,0.07086378537312041,0.7841644299338543
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Discussion,1.1055068057762731,0.6557777536496531,1.030186386107757,1.097486946136155,0.9179654726535125,0.8408771423307112,0.25833382058504584,0.29703528540744506,1.212729873895042,0.7193816160437664,1.1301041292419538,1.2039321683366515,1.0069989131449901,0.9224337882423854,0.23551567310766242,0.4559101390454963,0.2530113437711453
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Documentation,1.2251877880980349,1.129772022123297,1.3897091537352713,0.8766209255914225,0.3150365376579707,0.09623831568141661,0.5868973540397947,0.33183856270089035,1.4290952862594433,1.3177995137142298,1.6209978749116964,1.0225165846158841,0.3674679386622134,0.11225521886025841,0.32744003769764823,0.2384000112612393,0.5086728067761659
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Factual,0.6514593676895895,0.6522819173962683,0.7521760095518127,0.6197683553794933,0.4869494263538346,0.8040423382785021,0.2568484172277198,0.534778810634422,0.8786129630024069,0.8797223228041776,1.014447907619221,0.835871795084185,0.6567410026482637,1.0843992062308747,0.19160909480952237,0.3659989109167668,0.3162257181972067
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,General Character,1.010913589508214,0.7557175430111241,0.9497935765813883,0.6137934457045906,0.36546911654871306,0.22373352021662665,0.22601314487920338,0.37718147075482567,1.2521890383636487,0.9360851741225404,1.176481469481478,0.7602879538939038,0.45269588454674425,0.2771321549511086,0.16101898835542444,0.2665414277633734,0.3947222449528749
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,General Explanation,0.8998693791065794,0.7859462447315693,1.0065266577392604,0.6670535392120414,0.6642088881431105,1.848423418977209,0.33708681765696613,0.562124873099942,0.8868904938310038,0.7746104815864022,0.9920094463295597,0.6574325747041754,0.654628952256431,1.821763465818884,0.20595950158246934,0.4011746918767031,0.37196022912563736
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,GeneralExcerpt Language Translation,1.141698992870139,0.5307501982344796,0.9142525746719763,0.37576914436133824,0.37970950834558936,0.4049257977909605,0.1181303900459666,0.07801127544302655,1.3849011451975062,0.6438094120595756,1.1090046024127522,0.455814642568487,0.4605943740286735,0.49118218075234865,0.23159754042723202,0.2163889079764952,0.5650228390160965
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Human Decision Making,1.3259501271661847,0.5492575666347517,1.1900694853434437,0.3804920729970823,1.3666149282901165,1.7563492611858533,0.22949834961690857,0.4889559876582628,1.2613446685339578,0.5224955970307383,1.1320846612315822,0.36195301607611086,1.300028122039322,1.6707730790863107,0.24776140272163188,0.38112699188099414,0.5889525449257549
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Hypothetical Scenarios,1.240404957671592,0.2165362102831876,1.253611450184379,0.334263637051587,1.6047944777302894,0.7640458587912469,0.47332399785878687,0.06176976983222743,1.070665331884953,0.18690493940233022,1.0820646201591482,0.2885222972445278,1.3851910228829865,0.6594922149566552,0.24239966080045994,0.10817018763474862,0.7161841759186249
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Idea Development,1.1593838115637327,0.9522041041940165,1.0978982209894945,0.988093547298619,0.9071180983966861,1.2614827939052358,0.27840708243657303,0.4349179974972232,1.1467970150570097,0.941866544558617,1.0859789399413575,0.9773663555909136,0.8972700128893638,1.2477875645383514,0.24302102790284247,0.38725335074797623,0.31654284699666924
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Interpretative Analysis,0.9434645135605545,0.15628444426896715,0.9731791217143251,0.5480066056225109,0.918633794290189,1.1643962238388066,0.35466649372334214,0.44513197012921935,0.9199059466345478,0.1523819789542843,0.9488785728962505,0.5343227308091071,0.8956952572151666,1.1353209316858757,0.1911276538953804,0.3043180260119911,0.4351877387978547
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Literary and Cultural Translation,2.803866312641273,0.45086005797330353,1.8949146138567818,0.29596616540030585,0.5254036251410673,0.2023472278429786,0.10408148356447045,0.011553792324282774,1.273073298098474,0.20470943938987207,0.86037097639123,0.13438109392483422,0.23855535582263301,0.09187415660644649,0.09487497573844617,0.031363241960548904,0.808848003084366
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Logical Deduction,2.244738149526376,0.37158682999213655,2.0383067339037075,0.6401150363183112,0.28624729792421355,1.0387774608219573,0.25878336731994855,0.1530405742044847,1.2800314899941783,0.2118923508995494,1.1623167745487066,0.3650169191801218,0.16322863998460024,0.5923487606912414,0.13642748854407494,0.12520747111321784,0.6262809390646965
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,OptionBased,1.5299424544225209,0.6761358588899526,1.277842393626958,0.37033015682915504,0.2574580581904565,0.2599257073104928,0.10685515534992018,0.025656125752473447,1.0622801123001415,0.46945927543586896,0.8872402733057626,0.25713016816249534,0.17876004040319585,0.18047339542303475,0.08581668683017418,0.03877286677428615,0.7024972626604205
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Paraphrasing,1.5488610976761326,0.0,1.0010429930280687,0.0,0.6345970987026747,0.0,0.09101774620458203,0.0,2.5001301182961195,0.0,1.615856799769717,0.0,1.0243496475121914,0.0,0.24307555423750293,0.0,0.9999999999989241
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Personal Opinion and Advice,0.9212556714802276,0.0,1.2984518334998538,0.0,1.1520322735602913,0.0,0.4819915100426483,0.0,0.9101562055587791,0.0,1.2828078355058796,0.0,1.1381523666499263,0.0,0.2556757325640362,0.0,0.9999999999987
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,PostQuality Assessment Rewriting,1.0413479286553287,0.613622081182366,1.2510067094465631,0.6310784138463265,1.4304139399143891,0.5636521865016303,0.3012551207555114,0.2196194464233926,0.9747777414353533,0.5743950988236758,1.1710336777924448,0.590735501545648,1.3389719528878985,0.5276196266339462,0.19555273350306113,0.1954764958870019,0.5114389727557277
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Problem Solving,0.7855349698782298,0.9294811685470153,0.7440419068968782,1.3886923909034046,0.19905702901626346,2.411715739982167,0.22140787984023946,0.37385879872483774,0.8011675065922245,0.947978306229543,0.7588487110142291,1.416328060374119,0.20301836292703487,2.4597100830663896,0.08782265151238033,0.7028225933298554,0.04846635062336812
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Professional Content Generation,0.6818937068367041,0.5455560929546972,0.635693831644913,0.43669392413605757,0.14641384778882188,0.2815176371108106,0.18123039232751598,0.10364959739881857,0.8833378192829344,0.7067235327375225,0.8234896397295872,0.5657014499076893,0.18966722778330797,0.3646831949934672,0.11611988287873354,0.11765140627004159,0.440973853312046
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Quality and Compliance Assessment,1.0709597180957646,0.34135812827169154,0.7035541824459118,0.24489132656064172,0.2039923272563361,0.149704046615537,0.1401511790974691,0.05706831733781764,1.5967035797064126,0.5089339366959764,1.048935326555723,0.36511070505404786,0.30413401518217387,0.22319512404498246,0.11801784540279336,0.057614581363477135,0.6660484643384853
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Quality and Optimization,1.0503959754287953,0.6687329115298437,1.1914175529182784,0.1984629653392178,0.7569513675711423,1.503003951528791,0.30582868234264493,0.38753330387713997,1.0686410668134145,0.6803486196705608,1.212112150611648,0.2019102128761272,0.7700994062138172,1.5291107199176284,0.160881036287997,0.30844593697784767,0.7360023425864376
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Specialized Summaries,1.1030391566562368,0.12914030394856768,0.6868747022827033,0.12333675879588968,0.10117361392148927,0.09459321626805905,0.07540139064987794,0.007747072903753094,1.5594691525139899,0.1825776710996989,0.9710987170203738,0.17437265898729215,0.14303855761314002,0.13373523679277316,0.09511509250437233,0.025292628472906697,0.8070509600408529
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Specific Character,1.0150263380416078,0.35040617504515814,1.1484621793472758,0.15153193485260097,0.7016348997969948,1.1933139869642322,0.28205656610930285,0.31807773551740026,1.308681756594526,0.4517815464418704,1.4807216777747896,0.19537136254580645,0.9046236129458108,1.538549480048224,0.2675938316137305,0.25499520385829955,0.7931111099915843
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Standard Summaries,0.7353594377708245,0.6267828764892263,0.6145360253008978,0.3796295382352177,0.2730865026173532,0.2963749411876959,0.17670987937470461,0.16169564107195622,0.8867569690765825,0.755826409884067,0.741058148156965,0.45778856081305674,0.3293101943326906,0.35739331143222164,0.08709647146194321,0.03816007901374552,0.5988990519715746
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Supportive Conversation,0.8653222914260709,0.0,0.9055815298474041,0.0,0.06004612858755055,0.0,0.28368934260928536,0.0,1.6735006579466465,0.0,1.751360505836961,0.0,0.11612694679667794,0.0,0.37923256940185823,0.0,0.9999999999990613
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Technical and Practical Support,1.102216606949558,0.8390007008123499,1.1160628603453173,0.4100981502867773,0.3337495434849128,1.6362570040107525,0.3081899975450906,0.45572952454594967,0.8913388734700568,0.6784818290592969,0.9025360409104144,0.3316375574311218,0.26989608053020564,1.323205861231575,0.18129971351195528,0.2023782782059061,0.5752804926489847
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Technical and Scientific Translation,1.7651916705326505,0.5363024087545611,1.3052835657858803,0.3486125597125391,0.6859036366567632,0.6044712156955645,0.17254921359356284,0.08020585740796266,1.53008157488031,0.46487100970268513,1.1314297293281417,0.30218002004680966,0.5945464926623106,0.5239602503915041,0.15683600393278857,0.0011396620269931712,0.9859514753574292
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Textual ExpansionReduction,1.5628444426896717,0.8404401627990379,1.5076764908347857,0.4456777092136901,0.495740426343964,0.43643973342809106,0.2925505540176196,0.3393213437965191,1.4098266699730813,0.7581528421289452,1.360060200753052,0.40204149786053445,0.4472025848207376,0.39370801035317343,0.19670499202345393,0.12361705965596656,0.6689275486387795
rb_Ray2333_GRM-Llama3-8B-rewardmodel-ft/GRM-Llama3-8B-rewardmodel-ft,core_eval_runs,GRM-Llama3-8B-rewardmodel-ft,Tone Adjustment,1.6796465010380577,0.6448789700361592,1.3065116781951578,0.23244455009638476,0.19131992083781624,0.025576154942043128,0.16906888188485125,0.07455075778567011,1.0992100006793355,0.4220277377730651,0.8550196138228486,0.15211854036634898,0.12520537516920138,0.016737786953292394,0.09399938839482769,0.02104945449706286,0.8201353210711545
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Analytical Reasoning,1.0047488642131281,0.6779371057761119,0.8202698286194449,0.37722220862600186,0.2476167303501807,0.310301217760258,0.12927962151954692,0.07575361713546827,1.2999120714658692,0.8770934298922166,1.0612389723045021,0.4880380760638925,0.3203586372114888,0.40145783003498536,0.1345167765616349,0.10425502883459586,0.5614472420105546
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Casual Conversation,0.6049490478685087,0.28101022991445135,0.5560300767837022,0.555965297690165,0.3458730682874307,0.6075810987945117,0.10808856625270918,0.23459969721448357,1.158754471558502,0.5382632828572202,1.0650522388030559,1.0649281571724387,0.6625053231868763,1.163796054466388,0.15539982332936203,0.2306394816408642,0.30276007807410643
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Content Categorization,0.7681036925797622,0.19919054015182602,0.6030884346647817,0.024977636187920926,0.2005146937900623,0.09893319325679488,0.1063470565429766,0.07749607924806345,1.2635615819398915,0.32767647969035973,0.9921048211541463,0.0410892198535433,0.3298547658801342,0.16274909675029264,0.10602247648030816,0.026662304683576776,0.9313506105675405
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,ContextBased,0.9614774149636218,0.8969722163009697,0.7655099015493083,0.49268384096562734,0.1254162660215201,0.249449264403097,0.10515834129149121,0.08141934990730909,1.0755561192116438,1.0033974183801706,0.8563371807954674,0.5511404758346055,0.1402968288893453,0.27904626628358753,0.08814050319794359,0.09458522984476786,0.4472556970902025
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Creative Writing,1.27355796417526,0.5407748876154155,1.0119397541096613,0.32063467366975795,0.18594900376946344,0.3534544390063504,0.1019192221099271,0.08846862857030446,1.1507932377122176,0.4886468470992693,0.9143937368847596,0.28972706748700716,0.1680244340003784,0.31938316879929507,0.09975497386403881,0.0460906900081052,0.6866628807815236
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Critical Thinking,1.1928732632454084,0.5554351600387456,0.9280635539098742,0.10678632417800649,0.46837205832471157,0.6444844922101343,0.17956947218304475,0.10614796130141646,1.2480454017779958,0.58112485108954,0.9709878548737956,0.11172534834131576,0.49003495318780804,0.6742928455213497,0.11503844862525359,0.052241366756029195,0.8322642268869485
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Data Management,1.1306380422483042,0.3362877329106446,0.8969406888333926,0.21169668193239777,0.19919054015182602,0.27996391208423793,0.12582389660518106,0.0354721422217254,1.189212763832333,0.3537097190736038,0.9434083019532642,0.2226640063466685,0.20950996157193355,0.2944679421903719,0.08869301227157284,0.05734935233438662,0.7076449164116432
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Development and Implementation,1.2460285735803212,0.4072038450584181,0.8466596326259213,0.3419189983954751,0.23127762027837256,0.17913907077281982,0.09263716496991581,0.029842639809385646,1.4926492970510106,0.48779983538935756,1.0142350924178736,0.40959345830803484,0.27705333942720384,0.2145952458338101,0.1236394217228643,0.060392631802830776,0.6426611876081848
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Discussion,1.4418141472338253,0.40712995255628437,0.957863578992602,0.08024791413946097,0.17422078583079942,0.6438224153910161,0.09865951528799666,0.0899931794617519,1.3602245187804232,0.3840912123517668,0.903659829037041,0.07570683128813749,0.1643619498552571,0.6073896811426358,0.09924673609231138,0.026580731998519985,0.8801778652996021
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Documentation,1.130898143855815,0.0038542329112949414,0.7368875587450316,0.04200640961297697,0.1639113039331028,0.043602487659065436,0.09241989411951346,0.015428315692331274,1.61189574543982,0.0054935288959518935,1.0503031835907675,0.05987272443962266,0.2336266399186408,0.06214765204990963,0.12338088868995328,0.021332816714671965,0.9178315281258884
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Factual,1.1472845451289886,0.7411477078013813,0.8717765151289742,0.4926049607195998,0.27698456639820634,0.3830587310612039,0.11494716893330115,0.08269988389880467,1.2295721239030808,0.7943056193655847,0.9343036179010366,0.5279364481712722,0.29685094516489463,0.4105331493658267,0.12229851381921286,0.09944586792083548,0.5133645444917659
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,General Character,1.8257241199196095,0.5987066092882518,1.2567360897561781,0.1341316403398518,0.22151198719638013,0.28833445472594565,0.09100811183984092,0.1155543970945766,1.398163136831692,0.4584972624019381,0.9624247465714931,0.10271974443444698,0.16963674384602512,0.22081025346481964,0.11041277170274566,0.039448798527157714,0.8484357900556239
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,General Explanation,1.398306241977491,0.5952543515885647,1.2028684557006961,0.572590751470589,0.48274858353984806,0.5880187977796307,0.1463285371377412,0.10393036934562838,1.2647985017886838,0.538420547281323,1.0880207603647374,0.5179208265378789,0.4366566256175218,0.531875831007863,0.15937137880644558,0.12733379499935815,0.5506837437800784
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,GeneralExcerpt Language Translation,0.8212590029146749,0.8550722118910651,0.7482867087408602,0.6013064524664997,0.193326431182494,0.14140069208308637,0.1296309898424039,0.14990524792135854,0.8429192954138278,0.877624310743834,0.7680223937403945,0.6171656072469381,0.198425318418273,0.14513006410729848,0.06527884956024238,0.09147410447913987,0.3600872098695898
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Human Decision Making,1.3974550003529105,0.3009080028890194,1.0096586104660132,0.21453037728644586,0.1626817326975977,0.3181752027876271,0.13695401570993904,0.10359892828482509,1.1897804406956416,0.25619032898786065,0.8596141315527408,0.18264920642597504,0.13850574335001717,0.27089146548224285,0.10190245758613548,0.03927631425527961,0.7603785611437889
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Hypothetical Scenarios,1.5814177736650197,0.6009765869538,0.996916915809195,0.42011664190906606,0.4108659574641654,0.3276334430607434,0.08208646650418783,0.01129976146107703,2.2112716663624847,0.8403361344537816,1.3939732854635927,0.5874425103164758,0.574507423365947,0.45812470408371087,0.17509301585068193,0.13567599363615596,0.5793037836279522
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Idea Development,1.4697159560395179,0.5367078442979758,1.0633971789511012,0.3072006062679469,0.3509952965353402,0.4182552076775373,0.12791425629197145,0.11001226516851348,1.3065210560342573,0.4771126669970005,0.9453192635745501,0.2730895441845296,0.3120213423607136,0.37181282096116164,0.11784314746947838,0.05841760141378857,0.6941686893177711
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Interpretative Analysis,1.6375997208873296,0.15048060274527786,1.392258222780626,0.38528667210331546,0.6000307629264883,1.1151265282003937,0.13783545680495746,0.09344655701981402,1.2852886869445048,0.11810640527484728,1.0927296336346402,0.3023966080163736,0.4709409397005855,0.8752196845948775,0.15417706448792523,0.12996734044502392,0.6592320975617785
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Literary and Cultural Translation,1.5776344775557731,0.09827111643767683,1.112530766703216,0.15837297879540024,0.005674944163869687,0.036319642648766,0.09922042822755195,0.01023116535191243,1.0315886546918767,0.06425783046911637,0.7274651595500009,0.10355742756599107,0.0037107505564455997,0.023748803561251836,0.06670297265996816,0.0043231377762118495,0.9147816771468796
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Logical Deduction,1.1764159251701862,0.21432372458881188,0.8429446460297584,0.19945868783179133,0.1322261990181637,0.2235928000564657,0.11475933147104378,0.021743925341832337,1.4250579355138255,0.2596222288048182,1.0211056577203526,0.2416153843384512,0.160172937276759,0.27085037462250233,0.12737034220091814,0.05428202053575071,0.7376456273237033
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,OptionBased,0.6195147378891076,0.4113388694778212,0.5830059304348792,0.3260018966136308,0.16873500647239204,0.0832325144034221,0.12660417108311828,0.028938289326890576,1.1091926875650762,0.7364700760642011,1.0438265230765085,0.5836809001320498,0.3021068327658162,0.14902130764233082,0.09653304445004207,0.05995465024318136,0.5551594078105536
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Paraphrasing,1.9767722170812745,0.0,1.1606942280050954,0.0,0.042372916423560336,0.0,0.06750467785477732,0.0,1.8626925043685043,0.0,1.0937104536814664,0.0,0.0399275713855067,0.0,0.10732328441410288,0.0,0.9999999999983348
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Personal Opinion and Advice,1.1160723522277052,0.0,1.0008604765452915,0.0,0.049182849420203956,0.0,0.1806310470790965,0.0,1.2638334199005674,0.0,1.1333682053773102,0.0,0.055694354097313145,0.0,0.16257484054230387,0.0,0.9999999999984567
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,PostQuality Assessment Rewriting,1.1800100564739704,0.9921694046498837,1.0721650990487281,0.3127971422767747,0.09193409545468893,0.10933725755722264,0.13293009942169787,0.13840068275704942,1.342739223748205,1.1289944258671585,1.2200219183965604,0.35593340048847133,0.10461225757320097,0.12441540098211967,0.13227925617537784,0.10641050116531348,0.6516852530584841
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Problem Solving,1.8787848478517912,0.8724280827922335,1.5256772109887884,0.5353613587035388,0.40216437641289854,0.236077677216979,0.10252590555702346,0.07046591812521974,1.3235290374635076,0.6145908095833363,1.0747787820669563,0.3771407378499074,0.2833087730212865,0.16630731360798,0.136320099443336,0.08426605587532421,0.6391143162150298
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Professional Content Generation,1.242434442276537,0.6049490478685088,0.9102873167743454,0.2805360042563133,0.3103721545623064,0.2434551046300096,0.12026908711033218,0.0598875667128822,1.1785196350737634,0.5738285312067444,0.863459221567844,0.2661043336462934,0.294405616815968,0.23093099426612873,0.0973919967830636,0.03922119475281308,0.705278469050208
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Quality and Compliance Assessment,0.7854122722795648,0.18443568532576482,0.610681792389603,0.09521426572718483,0.04029210356347478,0.07245012049206967,0.10696862203137203,0.01613346061045212,1.1047280604918435,0.25941952287561354,0.8589594737629894,0.1339244048258359,0.056673188074364805,0.10190531001165129,0.07914014190278895,0.019946983299653054,0.824767909691832
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Quality and Optimization,0.8614565240754186,0.548294188632543,0.7456535083759351,0.2489080428763576,0.33747001294478407,0.11425554249924302,0.15785006215800435,0.15960484559325044,1.3291349712922607,0.8459590940471271,1.1504633452197939,0.3840383991263576,0.5206800151043902,0.17628404098825767,0.1789740839542212,0.09492497465679528,0.6638225125766105
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Specialized Summaries,1.2560188398688,0.39614657103912854,0.900544344059674,0.2717625011696361,0.42553805268783684,0.3700891190866934,0.09791345969495618,0.011110918842433815,1.1770018859151619,0.3712247351803921,0.8438905195237254,0.2546657472360011,0.39876717979371046,0.3468065743084077,0.08890094216244832,0.04846907660745364,0.6780689328130857
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Specific Character,1.038609364390884,0.23350030674255473,0.9563331831706325,0.08923324239892128,0.806031236074958,0.5868838089468569,0.26960365094533545,0.11351588609908725,1.1760214401099545,0.26439330937723815,1.082859798743343,0.1010391489135134,0.912672316967219,0.6645308292398021,0.14502727947599003,0.07363043178266215,0.8541549220103889
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Standard Summaries,1.2234233793275735,0.3125002586237575,0.8335021694904308,0.1116959062253311,0.13884696720934503,0.19975803456821303,0.08453462503622433,0.03835619888528069,1.192039619323645,0.30448387338579996,0.8121208288261558,0.10883063687244054,0.13528520766657218,0.19463375925871962,0.07348886203892391,0.023950201123024317,0.8270245241497147
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Supportive Conversation,2.134535664836852,0.0,1.3978438391196941,0.0,0.13317202304547535,0.0,0.08728478829370401,0.0,1.4563631324022006,0.0,0.9537288440224186,0.0,0.09086136522608554,0.0,0.09843838014007311,0.0,0.9999999999980992
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Technical and Practical Support,1.6006180014194453,0.25726413542875926,1.2017587545242079,0.1701261559792293,0.357899811934715,0.3693915738665511,0.11421962756316606,0.04430440464948193,1.2744271894850019,0.20483613752875995,0.9568516851672816,0.13545605424293872,0.2849632148561867,0.294113064381828,0.10420092008375414,0.027904986516486374,0.8309233550980916
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Technical and Scientific Translation,1.66370446404113,0.31798603798216485,1.2111907219076765,0.11415045094065279,0.49372014225666283,0.17365329141441244,0.12011325744772516,0.026381711574509636,1.2216116617785533,0.23348825508240467,0.8893434756594027,0.08381748386228693,0.36252489337601185,0.12750875560121794,0.09558107772916158,0.01069078115309352,0.9027622457778752
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Textual ExpansionReduction,1.1878603959006568,0.741620619815037,0.8645875956991647,0.2799310709721784,0.08134086634879885,0.16079008464297448,0.11522970863247706,0.09914909369556604,1.2968354942232945,0.8096573859546822,0.9439054335317925,0.30561213255822817,0.08880313122318921,0.1755410733481647,0.09058709445362756,0.09281400300577647,0.6113693097438591
rb_Ray2333_Gemma-2B-rewardmodel-baseline/Gemma-2B-rewardmodel-baseline,core_eval_runs,Gemma-2B-rewardmodel-baseline,Tone Adjustment,1.6881067239457697,0.6181905842508715,1.1914526798918876,0.17949978933740263,0.04653454214373144,0.0041616257201710985,0.09676519813626178,0.026361058488969802,1.8037625224959948,0.6605441420346156,1.2730816487082743,0.19179770343307645,0.049722723054013304,0.004446747590196316,0.12763364468823768,0.028646255895346084,0.8350271687388088
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Analytical Reasoning,1.1449872159969985,0.4407275989414221,1.0964225012517472,0.28487417443756535,0.4633411302089021,0.6826586319642522,0.18170681028008645,0.104196574491012,1.0722317021782577,0.4127226024950976,1.026752917760774,0.2667725164929854,0.4338992102199953,0.6392806982743025,0.13771434426377954,0.08209841679868346,0.6963453005928775
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Casual Conversation,0.933972592886126,0.15073437409039614,1.0502728966451786,0.38364249563237085,0.22222013412699676,0.2701473197983721,0.2073455068806564,0.06484933449143346,0.9283977465298618,0.14983464642968936,1.044003858264885,0.3813525483842668,0.22089371072393071,0.2685348208739886,0.1275169339599772,0.0558382370813407,0.6714605088854418
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Content Categorization,1.2898488222656905,0.4150089260670644,1.2107652157368987,0.22864792726007477,0.6760180166256302,0.5878168067976572,0.1946632408952328,0.11548437750274432,1.1156656364334125,0.3589654769098085,1.0472616028084394,0.19777095647356902,0.5847274949898976,0.5084371133589155,0.16305593497425186,0.06727945309948458,0.7717427417367259
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,ContextBased,0.9364027037089,0.4195653838597656,1.0875120949015757,0.24167227122225687,0.34858589691124275,0.7572157820685255,0.23931863734596387,0.22442428279651777,1.1057121029850308,0.4954262958552089,1.2841433292669047,0.28536862846286803,0.4116131271492718,0.8941266951582022,0.1726528890928633,0.08219442072891223,0.7385938062849047
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Creative Writing,1.1991921879605398,0.5979928958680232,1.1679063862028363,0.44769166653076053,0.34426569989297795,0.3530411000863284,0.1909936142209121,0.16131120698386414,1.2695834225579268,0.6330944072381105,1.236461179374734,0.47397066454496295,0.36447370982524213,0.3737642161541209,0.1614379196319089,0.12422462581303495,0.5994570720813873
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Critical Thinking,1.1329716680399493,1.1192010400442303,1.2855286252474258,0.44472153108070334,0.7355135423595858,1.0622790691746706,0.25727581095577357,0.3805387099757781,1.1155550755868322,1.10199613639357,1.2657668529239534,0.43788505504122854,0.7242068698518902,1.0459491977920092,0.20147220503409824,0.1605601532881581,0.6172653998292951
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Data Management,1.0180814285854694,0.4424826789800924,1.0644091663156208,0.3179024663740766,0.4824107498598366,0.5986341751129218,0.27878090794520755,0.38653033176382867,1.2203298505789357,0.5303847082976345,1.2758608912938678,0.38105583541384025,0.5782447471928368,0.7175567031823795,0.21828643422418714,0.17027486080742946,0.6492438771876262
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Development and Implementation,1.290793865363436,0.6558261582961229,1.0949899359210378,0.3722044740128325,0.3140243207651242,0.5131246505365631,0.15949132948575173,0.0957880722257724,1.198662853411845,0.6090162614958048,1.016834520420354,0.3456382067226975,0.2916106889484313,0.47650014016369313,0.13696279644828752,0.06473532824450023,0.6733836999447995
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Discussion,1.0875420962697584,0.6705755809287928,1.2040700889837666,0.320530398718303,0.383957509998286,0.3084890683354724,0.1772332765006685,0.11183380516699354,1.017102267496015,0.6271425687608102,1.1260827713520962,0.29976972519592704,0.3590886783885131,0.28850830876151634,0.1263398483830695,0.0835054704258171,0.6841723184043611
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Documentation,0.8472648886679829,0.267885966671624,1.2143541294057174,0.07583970859407074,1.0210515640355262,0.40920366132377095,0.44764059221958624,0.10271726806281867,1.137885855856248,0.35977373373425636,1.6308906533631622,0.10185354412257452,1.3712832295487876,0.5495649171971937,0.33211316979965333,0.07666079388670999,0.9063703833291856
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Factual,1.1510624930539333,0.8049235827350669,1.0393974006790605,0.42584692032295446,0.45740085930878793,0.6793509811221431,0.18606160761830048,0.2381332345747489,1.1129684953159378,0.7782849272971541,1.0049989188683157,0.41175367020195064,0.44226334296626757,0.6568681056098755,0.1404630738155429,0.0817800851048196,0.618190532374585
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,General Character,0.8300178521341288,0.45746836238719835,1.0872120812197519,0.29728824503312645,0.440525089706191,0.6613107834169669,0.3152683956075201,0.29258306335195733,0.926445305911999,0.5106148209308405,1.2135191147841558,0.3318257533926495,0.4917031608963651,0.7381386671025842,0.15809380278463064,0.12710572723661495,0.6649204868830397
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,General Explanation,1.0427875552836712,0.7696025969568316,1.0068796677403675,0.31647318244301226,0.4287795540627835,0.5855217021317041,0.193327044728126,0.1784992714361101,1.011408780080998,0.7464442971066454,0.9765814055583142,0.3069501105582002,0.41587704369979933,0.567902625480488,0.14317309340684448,0.0843007271944044,0.6637557305162177
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,GeneralExcerpt Language Translation,0.8364643961223208,0.25976028360797376,0.8761979938475971,0.2668491783184995,0.42088169388876806,0.22870042965439397,0.2430146829695502,0.15381141921546002,1.1577313773666558,0.35952831025534127,1.2127257477612203,0.3693398884591126,0.5825327956971391,0.31653907166349754,0.20453301738499685,0.01430314977182956,0.8924868721049174
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Human Decision Making,0.9497008101557465,0.6684492339588655,1.012197410250697,0.3273983681751824,0.3275249364472017,0.24220104533647152,0.21340438679119145,0.10760231854581098,1.0476275662482024,0.7373752203264498,1.1165683951451717,0.3611574845226784,0.36129710366310885,0.26717518712762456,0.19529126803979596,0.10537161451048929,0.6673449079589581
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Hypothetical Scenarios,0.5778263511929198,0.5496100644173778,0.8246738584266396,0.18991241076558052,0.2540140840582894,0.10935498702482827,0.2952913841778748,0.10606931709152251,0.7559100420423743,0.7189976124192772,1.0788349297458737,0.24844263005625566,0.33230017385577737,0.14305774160147738,0.12286264156635285,0.04455833311662338,0.7545002780153599
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Idea Development,1.0054246013835215,0.7620422521748683,1.1410082845418081,0.30648888783352646,0.41797906151712144,0.535029399480734,0.19151886746845126,0.1433333662918108,1.0082499615640454,0.7641836796197593,1.1442146506565383,0.30735015728950177,0.4191536313407344,0.5365328943809207,0.15112949135203618,0.09110779699950308,0.6896071319491491
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Interpretative Analysis,0.7615022275475851,0.253153419808557,0.8712622330428756,0.15688639571861884,0.2823653769906523,0.5931833015312831,0.2660190459125679,0.14255859960412018,1.2517776190927812,0.4161403258578764,1.4322040359569652,0.25789403076753126,0.4641597181690545,0.9750904908942305,0.18010605442779448,0.15371672877349277,0.7427532685596464
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Literary and Cultural Translation,1.3776028241991949,0.21330972777682566,1.0771953744178548,0.3052237408520493,0.22249014644063828,1.0454876784200868,0.1575100202497508,0.21000634074186347,1.492869954926344,0.2311578330834596,1.1673267372900826,0.33076249862737117,0.24110639805160866,1.1329659869609572,0.11884381727491533,0.2366672301186173,0.6088945233914825
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Logical Deduction,0.9855449447916623,0.8636006336432966,1.033892149617591,0.11639988419317582,0.1036847284383557,0.2581317718413231,0.32510352476108606,0.3420068480329955,1.6926031434098128,1.4831724873433862,1.7756360190751066,0.19990849825682577,0.1780711252244844,0.44332290550678927,0.3495157979013339,0.11292377021896327,0.8462156326361144
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,OptionBased,1.140937031292375,0.908051410776537,1.1907168014490133,0.31273426193328113,0.7760153894058185,0.6339889124303624,0.2312916408210517,0.47507654250125475,1.21693454490536,0.9685364748590051,1.2700301323738836,0.3335654083302033,0.8277055690588107,0.6762187460508308,0.2470709769981062,0.07926652412592661,0.7491374130544378
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Paraphrasing,1.358296943773824,0.0,0.977737088722194,0.0,0.7212028897365835,0.0,0.11550353196200291,0.0,2.3085734074141944,0.0,1.6617705375934961,0.0,1.2257627613961461,0.0,0.27451749298311,0.0,0.999999999998967
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Personal Opinion and Advice,0.8664188871669304,0.0,0.7416525723239082,0.0,0.3964118279650025,0.0,0.17145714046348792,0.0,2.254494145982284,0.0,1.9298417975678364,0.0,1.0314966106839338,0.0,0.3722430615853438,0.0,0.9999999999991313
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,PostQuality Assessment Rewriting,0.9674541197776784,0.5944321084818752,0.986857504649642,0.1759205226795162,0.20682943224942832,0.45159559456549453,0.14624850613443663,0.1608207504785607,1.185020112144476,0.7281110178303276,1.2087870286802602,0.21548242263116468,0.25334228465047964,0.5531527037570851,0.18157067894551515,0.11067233032968915,0.7589264587859886
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Problem Solving,1.2440817351034477,0.689974278086978,1.2252858779371774,0.5919007430414857,0.3466958107157519,0.2856730278327613,0.12275958636456574,0.14343000527494132,1.073106413773623,0.5951504650073447,1.0568936889112808,0.5105552679964196,0.29904907982318657,0.24641271530602132,0.1569951388065336,0.09329200155261841,0.5868909928587392
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Professional Content Generation,0.8835309175439637,0.6073600027330646,1.1113294310673745,0.4161058510912331,0.4328297387674067,0.7389224478193104,0.30830129340085893,0.24760173875281125,0.8888683979157557,0.6110291126972192,1.1180430603327833,0.418619579551309,0.43544449755979947,0.7433863369523308,0.1519597970925805,0.15603043840489728,0.5776765371976711
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Quality and Compliance Assessment,1.1885942046501088,0.13662623070262514,0.9766195377573998,0.3164281803907384,0.12096551651141499,0.19710898895833248,0.12131480479643186,0.012808472669005033,1.1246622436424136,0.12927739556634732,0.9240892444386306,0.2994081797788126,0.11445903797178585,0.1865069145522404,0.09090867060223817,0.030657603272089884,0.7185751293847917
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Quality and Optimization,0.9577336764865825,0.7781079848365404,1.0033057547556399,0.18909862365363306,0.2569167164299361,0.42175923390810316,0.14268585107011023,0.21856468948151075,0.9374681519841235,0.7616433174457988,0.9820759307913747,0.1850973204885482,0.25148039092554086,0.4128348614037781,0.1254920523515226,0.04710545342885453,0.7780638536283403
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Specialized Summaries,1.213705349818773,0.5451548612422921,1.3887258314528177,0.29187581070447144,1.5254345659179442,0.08707897114940022,0.5534762053748473,0.26960706261787215,1.0739766078098816,0.4823934974790103,1.2288477247499605,0.2582733882889725,1.3498177632529313,0.07705393904753888,0.3048614169944025,0.05542633287065146,0.8228635146864711
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Specific Character,0.8060205077592358,0.7544481558536995,0.8000239842937797,0.24721930990369434,0.7326784130663495,0.3653941634354294,0.15250122714579029,0.33135361140245534,0.9026851505862233,0.8449278158809871,0.8959694744523029,0.2768678933599771,0.8205472655636589,0.40921525230293765,0.16239646461471735,0.08632703876903969,0.6764935096331334
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Standard Summaries,1.038737370579048,0.3774772144708889,1.1895129965506945,0.47861276415649656,0.4730615734999979,0.5754637434485563,0.2464848210453403,0.19556258118666342,1.0700065780232833,0.38884044608176493,1.2253210166530455,0.4930204886032694,0.4873021899393793,0.5927869819111313,0.1548217963087416,0.13778099221222034,0.5745510287576094
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Supportive Conversation,0.6256185307074744,0.0,0.921319515855201,0.0,0.2507064332161804,0.0,0.145920960964538,0.0,1.7451648126267685,0.0,2.570023618767496,0.0,0.6993463653534547,0.0,0.197211453584331,0.0,0.9999999999992772
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Technical and Practical Support,1.0620934357090421,0.6826586319642521,1.0053608484761338,0.31633020717276805,0.6221083706301342,0.48108666115578286,0.2180598667112611,0.1532857492049738,1.1595757877627566,0.7453152371707612,1.0976361011771776,0.34536401115278603,0.6792074780743336,0.525242342468158,0.1669971964368121,0.08977349598492979,0.6723677854555946
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Technical and Scientific Translation,0.6900502190501896,0.3092653537371919,1.015425503893251,0.18175284222804544,0.824482599704477,0.4694164072658371,0.5935923408662002,0.07711612854510563,0.8812604535722769,0.3949615816132439,1.2967959656060024,0.2321158486122994,1.0529435245714638,0.5994898698108693,0.3327885016878896,0.14379636776456406,0.7763690547140826
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Textual ExpansionReduction,1.3204277167855962,0.35911637714326305,1.0294341338142385,0.12910995043017937,0.6177881736118694,0.4289145602196042,0.11625748534430475,0.07425205342011754,1.2750937443399626,0.3467869086390576,0.9940907840316868,0.12467724513260559,0.596577779673807,0.4141887250925885,0.13242801147275546,0.059833796380753174,0.8217358308455001
rb_Skywork_Skywork-Reward-Gemma-2-27B/Skywork-Reward-Gemma-2-27B,core_eval_runs,Skywork-Reward-Gemma-2-27B,Tone Adjustment,1.65004524866352,0.3175344808424645,1.493807498597166,0.48690157965963876,0.2764926091689486,0.2214100971860721,0.12845248288461636,0.024597085568289823,1.3118437400259304,0.2524510290084263,1.1876292589101594,0.3871038020336657,0.21982130417060264,0.1760287787303654,0.12886398855497588,0.05779867209913164,0.6855500055031324
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Analytical Reasoning,1.296710059535303,1.0753435777836473,1.209814006391561,0.573035495455833,0.4051038104886273,1.0714074739118185,0.18090564114549335,0.6668907096488155,1.1650809343643536,0.9661853790321193,1.0870057054071827,0.514866623853353,0.3639816954977515,0.9626488293284878,0.20093730247196268,0.12124278272832212,0.5881892528591023
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Casual Conversation,0.37892871974096537,0.3776888470213392,0.469144220483283,0.293158829649441,0.07860399432042253,0.17234230802802708,0.19428658983428537,0.05650380462565738,0.8038279947364673,0.8011978314629449,0.9952036843527529,0.621882855270218,0.1667440018166329,0.365592695019593,0.1254625012820469,0.1253330137574084,0.46355439090224054
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Content Categorization,0.5567225316314754,0.456588049132149,0.6436666927114287,0.05263882911259132,0.15594843540185985,0.3440154783978433,0.23537962939823676,0.19615807392292472,0.8647568257829009,0.7092179849463836,0.9998071470543004,0.08176386654037038,0.24223462554806657,0.5343590679682236,0.14838747486890602,0.046567332723493005,0.8783169976135525
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,ContextBased,0.7711614705687122,0.321343520096109,0.8932703462947099,0.3817960620753741,0.29835667348462835,0.471929013972602,0.2130551823200479,0.2750811526944019,0.9136216094901448,0.38070675887492555,1.058288208939861,0.4523269717729266,0.3534734483429613,0.559110590668868,0.1458976911112823,0.14944098086663055,0.547088285664755
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Creative Writing,0.7848492717829971,0.948758477265631,0.8982735716606791,0.4589387778333799,0.4990586099091825,1.0325581286968673,0.3326913848341154,0.481447972577265,1.0275960514420417,1.2422008913832405,1.176101461184092,0.6008843900483496,0.6534129232429516,1.3519190169124025,0.28652881881506054,0.29752391185229665,0.5041642261629734
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Critical Thinking,1.1140748398824434,0.7488044005767244,1.04331353252664,0.7090191373300273,0.6332403908998286,0.43895430378635564,0.2687768047605341,0.3747429347902964,1.1801920410624103,0.7932438308780135,1.1052312495905354,0.7510974244119281,0.6708214230007087,0.46500500430780106,0.19937099898278376,0.09523387538607209,0.5706315157712629
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Data Management,0.6946436113003591,0.9581264044805836,0.9750691449244336,0.529604689338401,0.6086791027396166,1.065306512910484,0.37138738059173587,0.4897518150035856,0.8952627048167081,1.2348416115507879,1.256677562231129,0.6825591122520187,0.7844709013647764,1.372976263931562,0.40062381753195264,0.34063694349719364,0.5153757552165664
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Development and Implementation,0.7787778315607011,0.30071833580772567,0.7578836801744094,0.1883688109594902,0.7270771072042289,0.8002886392202461,0.42893481039095965,0.3930957493724418,0.8225552836306814,0.3176226209566806,0.8004866089416101,0.19895759027378124,0.7679483055381944,0.8452752787030408,0.12279832929235895,0.1633384289041723,0.6578166083771012
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Discussion,0.9279758488223744,0.6719519724792657,1.0052984226600794,0.45963784628144955,0.6829927438397456,1.2090136652709558,0.4568374793143709,0.5435534215330377,1.2767397948952395,0.9244936973428098,1.3831227435329683,0.6323849164093875,0.9396839549127161,1.6634008966746765,0.31740153139159166,0.34121872196449593,0.5264657319909711
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Documentation,1.7595958748623781,1.1184832762188917,1.6943877540524133,0.6332141502073496,0.9748942069745746,0.5642798510653868,0.41103121251923685,0.21353679704195938,1.561331550934223,0.9924569915745098,1.5034708240183383,0.5618660769445498,0.8650469723858901,0.500699022776329,0.3600184181182581,0.15620089818486393,0.6692820278414688
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Factual,0.7116475800266597,0.46209859455270935,0.6912738690411988,0.46392505608545653,0.380385078173542,0.6588176858719567,0.35031232097738974,0.44839427792652625,1.013529997617756,0.6581218015504677,0.9845137150839414,0.6607230519514948,0.5417452376647122,0.9382895499585978,0.1548406220975974,0.4375027258467887,0.33803305099949077
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,General Character,0.6219831338263981,0.7778528471508213,0.7948218282871502,0.2598233099416103,0.28141912651116474,0.21042411298797142,0.3481974540114441,0.31992365919973886,0.8136109145991723,1.0175027777691965,1.0396997595060005,0.33987269001217046,0.368121996327887,0.27525401527953264,0.17752169728774253,0.14765529872465633,0.625419288056884
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,General Explanation,0.8994784567903334,0.4465017829605875,0.9838160424173866,0.37244726869868716,0.8394922337836614,1.352602734515273,0.3293994251656639,0.42216170828080035,1.0380834345069474,0.515305398222833,1.1354170058502382,0.429839466353765,0.9688536448085144,1.561031819683036,0.31511323664775326,0.38517406542272276,0.5609625042494693
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,GeneralExcerpt Language Translation,1.3718699629678754,0.15938268603003047,1.0825750752859462,0.09723879084536469,0.1786991157810307,0.4034506468624592,0.13870258129790775,0.06828563522425868,1.4532865779808055,0.16884159914754004,1.146822854234667,0.10300963896675164,0.18930440455141828,0.42739430542996426,0.1576202327087997,0.0689173338182178,0.8615447926507902
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Human Decision Making,1.647495636592692,0.7226686708677807,1.2773313081396656,0.2881167899258468,0.6675238556234581,1.2938760647475864,0.26133871392285235,0.31862602973789367,1.3359273738863062,0.5860002528801744,1.035767149948246,0.23362921150688798,0.5412842205117037,1.0491830235662905,0.17117688257976404,0.25551267123467686,0.6757706932146721
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Hypothetical Scenarios,1.5165020997382272,0.19365631049398035,1.3303331334982145,0.3233241456971694,0.6845671853884772,0.8486239947663046,0.275482281855426,0.10712670605392177,1.0813407115976308,0.13808649037220566,0.9485930665623087,0.23054604530071132,0.488130130193781,0.6051107179725115,0.18763442166527322,0.09763880796095536,0.7179684189963755
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Idea Development,0.8820808776768498,1.0072489808010079,1.0782912822387725,0.7780430921712932,0.5913602457035695,1.2634893428570673,0.3663868466914526,0.5295665462439734,0.8853876476025362,1.0110249844778625,1.08233361131933,0.7809598422825259,0.5935771538411648,1.268225947703767,0.24670044707896593,0.3337098560838913,0.39019831969640456
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Interpretative Analysis,0.6001771183764661,0.20719650781307164,0.9062048209624143,0.6538437079444797,0.4387968596314825,0.548220547268325,0.41255131795227856,0.20775953484432808,0.743539264061533,0.2566887910558703,1.1226667012732743,0.8100249985061163,0.5436106844017556,0.6791720140247266,0.2147114324518884,0.30494670791812123,0.38407859890293766
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Literary and Cultural Translation,2.3536326711987914,0.2430937751241511,1.7030870902902482,0.13311028604775954,0.2739528294792894,0.38416373789049774,0.10862705551728002,0.018470382759249504,1.1228243166387106,0.11597034884541912,0.8124749548778868,0.06350161084951012,0.1306919734397857,0.18326920413395237,0.09269407171075739,0.018903448980920556,0.8931453203355437
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Logical Deduction,1.2935611764378399,0.5183455188811439,1.4520440253882827,0.33229026230853953,0.15933348473163267,0.9447436513163654,0.22076445278779644,0.26740969294341466,1.0781354240972192,0.43202182935541256,1.210225020381985,0.2769516505310864,0.1327985697646526,0.7874089109415,0.20395335391117364,0.14207504905218582,0.7085675749890398
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,OptionBased,1.4768261727101921,1.0345655416715005,1.1665299841476828,0.7459529119940216,0.22183881421627513,0.5394036745954284,0.11393187710756936,0.04729929251109466,1.0870658632114196,0.7615255636633518,0.8586622769911809,0.5490828651172488,0.16329166324785613,0.39704559140323287,0.09190327176667745,0.13549008442154942,0.4098305539637435
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Paraphrasing,1.1461934474765672,0.0,0.9578202630683302,0.0,0.04219503350600549,0.0,0.11148966879341637,0.0,1.136115855851368,0.0,0.9493988910191408,0.0,0.04182404524288003,0.0,0.08655909202977186,0.0,0.9999999999980694
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Personal Opinion and Advice,1.39637220957001,0.0,1.188002524130696,0.0,0.583389635363116,0.0,0.1662172919224656,0.0,1.143797045085645,0.0,0.9731171727296177,0.0,0.4778663858308966,0.0,0.12504534620363472,0.0,0.9999999999981788
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,PostQuality Assessment Rewriting,1.0721946946861842,0.5321612434712633,1.2465466024132172,0.4216004591603367,1.040390975401807,0.840122010403154,0.21893317387765632,0.40231892540735026,1.288799683511676,0.6396685653846497,1.4983741988603319,0.5067722690917178,1.250570970432475,1.0098436286664176,0.35890591612682277,0.08496363505226523,0.7569070013175697
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Problem Solving,0.8791681608116965,0.778482623770314,0.8964957647452365,0.6605744455653071,0.3770787509212058,0.6966903853137101,0.5846477541683137,0.1690963838529458,1.6754254452479127,1.4835496264377337,1.7084465552351393,1.2588527245559495,0.7185966943711947,1.3276786524394724,0.5278978624141419,0.3835771115869122,0.48109534365089407
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Professional Content Generation,1.0885688867929924,0.788165439295013,0.7950754883144459,0.45950513944604876,0.37782661065685325,0.5583018933100468,0.20654285604401473,0.19992483393257185,1.2019147193353557,0.8702321499844936,0.8778616989533727,0.5073505199450703,0.41716731960153597,0.6164343584897527,0.13521932775409728,0.18045578471289936,0.4463835022713847
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Quality and Compliance Assessment,1.2731721583817663,0.23986616994925125,0.7714107571472614,0.26214233113942953,0.16177386913216657,0.15764096006674624,0.13231227630644582,0.04426953406525597,1.4115604364667411,0.2659385797263438,0.855259752489355,0.2906360627850195,0.17935798534218467,0.17477584703052304,0.08893389348755576,0.03238973509518145,0.7035707343743487
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Quality and Optimization,1.4623413104618619,0.6875586243310672,1.4758323064825554,0.3737418095276439,0.7670679225420103,1.15785415519686,0.22059086232785075,0.23622103556914603,1.4501551328746798,0.6818289691283081,1.463533703928534,0.37062729444824716,0.7606756898541602,1.1482053705702198,0.35345649148110325,0.2784953041036895,0.6816147945635264
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Specialized Summaries,0.735933340915844,0.08781447738050219,0.6317720054831991,0.09631974183021241,0.2490372919706126,0.15401974450466366,0.17186823133873252,0.06324989405973258,0.5482189114671202,0.06541564911392983,0.47062599543374267,0.07175147677538274,0.18551537962520565,0.11473394665298398,0.03593578568119382,0.015803152069102433,0.7973833501937967
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Specific Character,1.0155147989318485,0.04739069061681955,0.8551535537447938,0.1482118045693428,0.15829041720559803,0.8848951919452075,0.14484877924277006,0.19571371678187882,1.2787964134697352,0.059677165961921075,1.076860030641592,0.1866370872354688,0.19932867351816047,1.1143124639310018,0.13198969491339618,0.19494568795538963,0.733279935456371
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Standard Summaries,0.7968248678130365,0.41636106756205793,0.6936989463711201,0.29062222937648463,0.0801390748304358,0.28339947877167865,0.11865699137902697,0.08894962535252227,0.8973705924582146,0.46889874170238066,0.781231936449495,0.32729380406121444,0.0902512609289135,0.3191596653674741,0.07622126392811324,0.057964123796231226,0.5825981160368892
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Supportive Conversation,1.109666403545995,0.0,1.0972807966959737,0.0,0.18578410275032267,0.0,0.2246316991097096,0.0,1.3234440069017381,0.0,1.3086723087543293,0.0,0.22157547221113097,0.0,0.21795125926340586,0.0,0.9999999999986899
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Technical and Practical Support,1.0533013961014057,0.701177543727595,1.1751281843832557,0.45106000991660244,0.5028766306648564,0.8169777196368002,0.2706695388317135,0.4741663489392879,0.9535805538669531,0.6347938709486511,1.0638734450333616,0.4083561036523087,0.4552670088267635,0.739630716594263,0.1924470504653013,0.19713409865760667,0.5721084559073893
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Technical and Scientific Translation,0.9803260303176984,0.03566110107876952,1.0238374718963932,0.2569794748942139,0.1683078015594025,0.1654738067716857,0.14754208445516726,0.022504278702508262,0.7568420821555272,0.02753147540483858,0.7904342637599836,0.1983961201021469,0.12993883976941437,0.12775090794916227,0.06486387255108078,0.020886579348944834,0.7541988231755412
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Textual ExpansionReduction,1.4618689779972425,0.3917210573244092,1.1955784307217798,0.6349285421159686,0.30323744228569616,0.17098435219224611,0.2499061810107422,0.3177910080513991,1.5640975778571895,0.4191141382561858,1.2791853139890372,0.6793291394667358,0.3244428578301504,0.18294129989799762,0.1758751311471894,0.12195120816213939,0.5480066486277005
rb_Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/Skywork-Reward-Llama-3.1-8B-v0.2,core_eval_runs,Skywork-Reward-Llama-3.1-8B-v0.2,Tone Adjustment,1.6612326391053749,0.22368878303603457,1.3889264130791337,0.10319370989748688,0.4421031868838187,0.12595532389852393,0.16377879963541614,0.06064858311141025,1.1144254033710344,0.150059935253112,0.9317508226794564,0.06922672301923483,0.29658159295494513,0.08449618032904421,0.11090555782980499,0.0186466577245033,0.9020778828767496
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Analytical Reasoning,1.2060409993622025,0.8479043129144532,1.1308096062368116,0.627285700064965,0.5518917753658634,0.919531650204003,0.23604708856181167,0.5639707171810222,1.1199697342693118,0.7873921106519596,1.0501073635771774,0.5825183382523702,0.5125050353419155,0.85390763537543,0.20875051077630163,0.16589648875281407,0.5202867288096631
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Casual Conversation,0.38641063537783515,0.42163526341990004,0.4557123531026111,0.49768476849349963,0.13956335346878315,0.2090505099018197,0.1285179992402864,0.10504939803038701,0.5909000243277863,0.6447656058137644,0.6968763690251467,0.7610606823045182,0.21342059821757647,0.3196805162102612,0.06772905920648337,0.18102976293039275,0.21211565938829505
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Content Categorization,0.9261289049543562,0.13383002493573803,0.8464033501934731,0.26012905668423414,0.20734229215396033,0.03944608569482069,0.17217040952260554,0.10218845558724499,1.029191904211699,0.14872311777278957,0.9405941992056868,0.28907716599433275,0.2304160979578431,0.043835789848229806,0.10908169768112597,0.06853863565353813,0.6618157777925866
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,ContextBased,0.9130522035741917,0.5298812915112621,1.032481276589598,0.46073883291240403,0.4863119215974706,0.5529716831374986,0.2824402797202154,0.17490549551493115,1.0236674052604644,0.5940757874019215,1.1575651701503225,0.5165568011816875,0.5452280395142399,0.6199635528439736,0.19253866680029724,0.18034285595059985,0.5463514222888988
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Creative Writing,0.7030788633947929,0.8062395076160909,0.8904984933859799,0.4736999078038884,0.6228908026791892,1.1597841316985462,0.33810126144655084,0.5789052958710397,0.9930988945451449,1.1388133045077284,1.2578291219076967,0.6691011197729925,0.8798332587843547,1.6381950860241967,0.32460992511974673,0.3131522225159593,0.5037072315167013
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Critical Thinking,0.9685398283494844,0.9864466626718719,1.0426105933243504,0.483091832969322,0.7112468930857063,0.7021363984304565,0.45653843139791006,0.30847499677477175,1.0376522013305105,1.056836818740773,1.1170084550236425,0.517563954783001,0.7619995406462133,0.7522389458234482,0.19893521685699456,0.15983422002232406,0.5555796965577807
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Data Management,0.6283099762241222,0.5719191558580073,0.9674315593636444,0.5670802963883368,0.5503995391723311,1.4956918984015228,0.5900219727062197,0.3582792949130586,0.7673387479824747,0.6984700953510475,1.1814991797234045,0.6925605212433212,0.6721887432326479,1.8266498895722811,0.38620624585081836,0.4349007248174093,0.4625489175548086
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Development and Implementation,0.6019209572227091,0.2010591923917191,0.7864343808698768,0.447906474300771,0.8879590738987407,0.6823053648058827,0.526523220567805,0.5171505155820791,0.8326831798693017,0.27814051937179185,1.087934675082233,0.6196231960848939,1.2283815281318275,0.9438850632978288,0.31041465299668763,0.3289554230468805,0.4766877022148222
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Discussion,0.9602147211645149,0.6887062726886658,1.1049747217283183,0.6031017654679566,0.9112065430190333,1.1243017983927956,0.5551386530888586,0.575259243953011,1.2749418715771261,0.9144417856911352,1.4671494913733452,0.8007789056646276,1.2098704068409798,1.4928113550707063,0.36377439317496335,0.43218736055036905,0.4733208503368919
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Documentation,1.6094160040980892,1.1143077428334807,1.6998402815096774,0.78575398443295,1.074410059343249,0.8054933895193247,0.4755803580963607,0.17182715624106665,1.5649529713761763,1.0835229727642588,1.6528791143121726,0.7640461071456268,1.0447275350588567,0.7832401754066983,0.4426804284133172,0.2608293225767956,0.5956730941996524
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Factual,0.740149151992016,0.4970717299403087,0.8760560904602726,0.5255114828918976,0.570662535905559,1.006866736899156,0.23952763535477917,0.5521844513501651,1.03457046180134,0.6948001329308873,1.2245393399858107,0.7345528344849639,0.7976643649669499,1.4073846901839113,0.22570235454533316,0.5305189484676773,0.38450553257126074
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,General Character,0.8108340243172297,0.4680909322869711,0.8546019227304531,0.3266077427797254,0.2525806104420971,0.25217809936357855,0.23753978527726644,0.3280523704434267,0.9744532047070206,0.5625475687770092,1.0270530803819105,0.3925143150003192,0.30354915791725867,0.30306542364041605,0.14638238170608314,0.1953628446846074,0.5520061908450733
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,General Explanation,1.0810073140936023,0.7475710635861634,1.1358404493103282,0.5985481543642281,0.8048650795431006,2.0412220352581167,0.3121566563366246,0.4137274173067017,1.1410632759876913,0.7891027893409505,1.1989426964942353,0.6318008296066853,0.849579806184384,2.1546232594391235,0.3518257614751026,0.5098773113884179,0.4616759103669589
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,GeneralExcerpt Language Translation,1.6373757980400625,0.9361818645739421,1.1408438661547242,0.4462906642453477,0.3565659115071894,0.10255687760352022,0.1277069884258792,0.008068781998571778,1.9486950323680787,1.1141809663194313,1.357755975961304,0.5311452639327943,0.42436087140018597,0.12205632827869062,0.23480415652887565,0.18407095927064807,0.593372077031581
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Human Decision Making,1.6528479312045814,1.11909860640219,1.4041855315864376,0.5207380723780839,1.1672428583303631,1.7057045079544357,0.2662644951548044,0.5521114720145469,1.197507743129451,0.8108000870445021,1.0173489134234046,0.37728085085274277,0.8456811631451604,1.235802953921854,0.18920017698296598,0.25547748364399464,0.5588444704015757
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Hypothetical Scenarios,1.6232388235750197,0.534691789766728,1.653886387970841,0.5933209644231414,1.2503368526860033,1.202899449481082,0.3399089831869918,0.12950750094979274,0.8943017075225989,0.29458128627897495,0.9111865729980486,0.326882245475105,0.6888563568685782,0.6627213543843682,0.19385624262124246,0.11816891923272088,0.6365647274471572
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Idea Development,0.9211024251445632,0.8452339955155003,1.1454805351000517,0.886335939793495,1.0016046408482788,1.4890259222475202,0.42911802779053576,0.5560809575507283,0.8685142322289755,0.7969773334966094,1.0800819977425609,0.8357326582066904,0.9444203618362998,1.4040134629185008,0.26376708832561574,0.4614549908324738,0.34199020696045324
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Interpretative Analysis,0.6748049144647073,0.29781892873023397,1.0138305058023065,0.5555721883168705,0.3306481249879443,0.7276614912145616,0.5364221272323001,0.2301138343851986,0.747577993475607,0.32993665633839653,1.1231651681927513,0.6154868360765331,0.3663062561140952,0.8061347892867199,0.22129622147967531,0.2942238490569705,0.46019636592296786
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Literary and Cultural Translation,2.4994170854195583,0.22964729630991698,1.76734869784398,0.09119221182697346,0.4184544441652654,0.282739489300855,0.10177306141320519,0.021289008481327254,1.0987404755110137,0.10095265052772118,0.776924171636164,0.04008797682074361,0.1839520253117987,0.12429190899445858,0.08673717353630817,0.010799748706023349,0.9296373268233057
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Logical Deduction,1.9215289847874217,0.9158108770635509,1.716089075617029,0.6416124765019922,0.28965089903932034,0.8422495231284358,0.18123197736064633,0.15352448823516263,1.1111022704924844,0.5295572187060176,0.9923089807028713,0.3710051136681328,0.16748733644961508,0.4870212028431865,0.11801994107915059,0.16268890863172802,0.5553152586320529
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,OptionBased,0.7077911882164737,1.6932953859240096,0.896545976907137,0.7941314507826053,0.15613502909169438,0.04877256190439749,0.15212741731442792,0.05415617338552736,0.5945335215574213,1.4223416250308483,0.7530845901485224,0.6670579909351538,0.13115098100933795,0.040968188735814315,0.08720404464388676,0.14623611999196873,0.3137303859930342
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Paraphrasing,1.6207255836701233,0.0,1.240111544149328,0.0,0.3644197862099909,0.0,0.1325478585669474,0.0,1.0887665294554927,0.0,0.8330786875120798,0.0,0.24480891144957775,0.0,0.0906628166589416,0.0,0.9999999999978348
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Personal Opinion and Advice,1.2088683942552112,0.0,1.5023938714812468,0.0,0.6082040569849503,0.0,0.30406257656071545,0.0,0.8095572184859446,0.0,1.006125902080548,0.0,0.40730321595342744,0.0,0.13324616742753292,0.0,0.9999999999982446
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,PostQuality Assessment Rewriting,1.1058255581544552,0.9317836947403731,1.3682584180848445,0.6511123924445892,0.9156047128526021,1.300601650783933,0.2468224934387223,0.7231397972291663,1.7771014022137839,1.4974098747062736,2.198840436847426,1.0463610079180636,1.471409669531839,2.090113581012803,0.6245046054630167,0.17896788438874467,0.6992660153205266
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Problem Solving,0.6634953348926731,0.45505350028032054,1.1556060339790455,1.0305287160732073,0.5866059015522461,1.5500603460316666,0.5880821485046459,0.11657882719278234,0.6764123855719701,0.46391256652509416,1.1781035873471084,1.0505912409350315,0.5980260544314684,1.5802372093684656,0.30169890676804656,0.5393319540209633,0.30424199440652433
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Professional Content Generation,1.0646712547117752,0.7762769756249029,0.9316615233561074,0.3985754146397301,0.3527960516498446,0.7220067014285445,0.2002651803256547,0.23560963614469452,1.0213803323798842,0.7447125409591898,0.8937789502437723,0.3823688182446381,0.3384509038839215,0.6926489669200558,0.1193845997601759,0.10300719998155103,0.5643936943253638
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Quality and Compliance Assessment,1.259447347341253,0.3386590771848019,0.8899596303160932,0.40549773253750465,0.2092272220826327,0.0261730374470861,0.11450630419722141,0.020640705368239254,1.7355798810922147,0.46668872831564157,1.2264077832404698,0.5587956558138786,0.28832531823582314,0.03606772233561861,0.1738362440644633,0.0868156209268312,0.6209414239772396
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Quality and Optimization,1.3759988479308276,1.2164866527169287,1.4660566111896183,0.25813722679432916,1.0153489215781815,1.381221674608191,0.3273896197406978,0.6623080576410962,2.0804714975404837,1.8392935516783417,2.2166362987493735,0.39029621544184856,1.5351789680481378,2.088368264354779,0.6454326255780094,0.34622339083848774,0.7671092932920991
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Specialized Summaries,0.837537198306755,0.1162373456014626,0.7479637573213036,0.04520777544207044,0.16037612143120722,0.06016068022345969,0.22255682290955425,0.12337577484838991,0.4325437163424386,0.060030448254576985,0.3862837661840973,0.023347427716429114,0.08282579414584204,0.031069813083112144,0.02430853703891067,0.00616028328892837,0.9070091997571439
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Specific Character,0.9352393996096059,0.12377706531615218,0.8434363308613037,0.07736939235004259,0.2853312679527795,0.7689728721512976,0.148155936879519,0.23817780800466898,1.2668510983986272,0.1676652108730461,1.1424970361887816,0.1048025775240009,0.3865023547600111,1.041630761204313,0.16187486212252844,0.1571377302291268,0.8456731467005328
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Standard Summaries,0.570505458411503,0.6772788849960896,0.5540646807003051,0.4424971180992166,0.24378427077495943,0.2595312895540765,0.16633516429100137,0.08070289195072233,0.6585350640911274,0.7817837452078521,0.639557456700836,0.5107748992252784,0.28140044588916013,0.2995772465981732,0.0515246360026681,0.09833573924705302,0.3243097703736289
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Supportive Conversation,1.6893488138858517,0.0,1.4883070745490277,0.0,0.15959073396092704,0.0,0.19479717973227317,0.0,1.3993178639710353,0.0,1.2327913923830103,0.0,0.13219186181099937,0.0,0.20687320706968348,0.0,0.9999999999986108
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Technical and Practical Support,1.397361387122448,0.994251450657781,1.425443352448687,0.5420973391219113,0.6195136365569845,1.1334711971083167,0.313885814096714,0.3935084286703506,1.1619422533508503,0.8267458809303856,1.1852931362150778,0.4507680042969573,0.5151416644801882,0.9425107127000601,0.22591576386594614,0.31419817435999453,0.5503076477230461
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Technical and Scientific Translation,1.150435566466368,0.004398169833568888,1.0415482749596017,0.14871979874512387,0.3090426065477062,0.26020623191134845,0.13729544364353896,0.0014035689051702382,0.8099639813750752,0.0030965307862510505,0.7333019007503738,0.10470615114142678,0.21758139900243492,0.18319815704661918,0.061112730711072594,0.007593122006385289,0.8755765466218131
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Textual ExpansionReduction,1.450060886378246,0.2727650684282972,1.1763173560598916,0.7368707047115619,0.32923442754144006,0.4788114712562951,0.22350352149945496,0.2478799472978729,1.4796961611473831,0.2783396396937041,1.2003580624812678,0.751930324627245,0.3359630779141976,0.4885970669140997,0.15655761733130746,0.14587277867031725,0.4727592477364087
rb_Skywork_Skywork-Reward-Llama-3.1-8B/Skywork-Reward-Llama-3.1-8B,core_eval_runs,Skywork-Reward-Llama-3.1-8B,Tone Adjustment,1.9094340177451075,0.7552285914213948,1.7101026777880044,0.11257220407348867,0.6553273052017595,0.03644197862099907,0.2819258897756057,0.18082914425388683,1.0738846983886694,0.4247480774804805,0.9617787686210466,0.06331172813248986,0.3685625996773222,0.02049533152568045,0.13251660839515278,0.011938748529674581,0.9267497110511966
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Analytical Reasoning,1.1700698205998967,0.5559462786075253,0.8906954462976816,0.2946467279601458,0.2712522456884312,0.4794338133147863,0.124878632105344,0.17781466979441182,1.3395751670983076,0.6364849481217563,1.0197284643203486,0.3373315274808283,0.3105479399148803,0.5488882964731603,0.11964341362884984,0.08934151491692185,0.6322352647570885
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Casual Conversation,0.9152497822226577,0.33979025743309155,0.824988432687619,0.4216878282468288,0.20733579225004364,0.552653795892599,0.14782515882850078,0.14273522752679202,1.2290386993499616,0.45628568742533737,1.107831687025179,0.5662614403486526,0.27841985585274376,0.742128450274155,0.18050693505185006,0.11005840501256228,0.5734440406578508
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Content Categorization,1.0630188229696294,0.2993440706873445,0.7773816833809183,0.027098374556896787,0.1179252524685563,0.08216103655596137,0.09932885582529627,0.049287140181378586,1.1129350528647945,0.31340038570953943,0.8138852353269981,0.028370834333656392,0.12346267465288015,0.08601907660241649,0.07267068270677712,0.0030236582574315607,0.9627755353379858
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,ContextBased,1.0422369137231215,0.38561315907110383,0.7824361981298267,0.23617354414334268,0.23633380515214764,0.36682486321161567,0.09873995226109955,0.05907946158155267,1.0952856346470465,0.40524044782935875,0.8222613462980028,0.2481945196700288,0.2483629377891981,0.3854958482249516,0.08467638465264227,0.051815368873345646,0.6674639009556564
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Creative Writing,1.1323724038271614,0.6432725862116739,1.033289147241986,0.2718454631398719,0.23150080300179698,0.3083455371923727,0.1287207417875354,0.05726783082365136,1.2085512661915017,0.6865479023904776,1.1028023139919667,0.29013350870766486,0.2470747146844769,0.3290890771789066,0.13641725007832906,0.07153588572730551,0.704416996398379
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Critical Thinking,1.5335115823062668,0.5396499994818116,1.2198363177425353,0.31966596371355194,0.41370498407001716,0.31124533848258296,0.12908636833921394,0.05397530610009871,1.1593495015272137,0.4079805885505523,0.9222079854087196,0.24167054227863494,0.3127649458894721,0.23530446863647203,0.10302075820781931,0.06263103749984122,0.6933352395534342
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Data Management,0.6600068561572631,0.29624792868477606,0.6949336873222799,0.24668784100897523,0.2208681982710255,0.2908863169242308,0.19423829504004891,0.13759539310442057,0.9646303897475851,0.4329799792883039,1.0156775606141413,0.3605456306983842,0.3228090347750271,0.4251437369917274,0.11516644655810071,0.13386713659071875,0.5804992541084051
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Development and Implementation,0.7453999629012714,0.35675201830021097,0.7087192805652875,0.19399469256417978,0.07177008193270742,0.1577975202089493,0.11960000308266516,0.12307562638415037,1.4792399992538163,0.7079713999852968,1.406447491591058,0.38498084674829847,0.1424271280231417,0.3131478606030691,0.1726427600016262,0.09036898539137181,0.6980452736387838
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Discussion,1.6675669794516184,0.4695865714334466,1.2171815228113356,0.21245050976011304,0.32526104471859996,0.35039265590042334,0.09618421769313723,0.03051909149360696,1.3481512262007573,0.3796391508625323,0.9840352937587155,0.17175646841739478,0.26295859877060024,0.28327635083014135,0.10171523111432551,0.05606706876710987,0.7671054765939874
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Documentation,0.6047293940626273,0.19319926096026796,0.5769522176412107,0.1227188186638607,0.6975230353493602,0.020661084192749146,0.36334836038018825,0.11401984729715531,1.3215860048663015,0.4222209833728703,1.2608812864026833,0.2681918141771156,1.5243788223962356,0.045153088278149456,0.3065305987889878,0.0808400093552245,0.7901905849003678
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Factual,1.088331671732091,0.663042585632952,0.8648388847933055,0.3292608574357392,0.3271942455787402,0.5025113985827108,0.1808379669382999,0.18883979988010546,1.1863266811785074,0.7227439304804004,0.9427102698556074,0.3589079968811978,0.35665530420553965,0.5477582755505315,0.1108853419311836,0.11974581927922923,0.569518425164434
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,General Character,1.2418399025326041,0.968162660257405,0.997060510029006,0.37644387999141105,0.24503320902277886,0.24140845741001588,0.12822633791352744,0.1346708025464577,0.9846482530928835,0.7676510234435668,0.7905639748133191,0.2984803500556852,0.19428552804751584,0.19141148177462355,0.09876213372667098,0.053863464914114034,0.6400507337023104
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,General Explanation,1.233805036457646,0.8300379130592874,0.9994887579844077,0.4087142380994815,0.5248640335280825,0.7660308408305808,0.13765291495942444,0.14787203430059892,1.0815895713560384,0.7276355048546386,0.8761810702501898,0.3582908519017891,0.4601111587886824,0.6715250337753791,0.11019505868890034,0.1141316102722737,0.5563575826049145
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,GeneralExcerpt Language Translation,1.248817549387173,0.43829075863202815,0.8726824445331456,0.3476938438807218,0.22231809891613072,0.2807974249353738,0.10978508604134429,0.02846431924410686,1.0122503074285203,0.3552640298705484,0.7073676000146243,0.28182915953733767,0.18020371677609048,0.22760512923240994,0.060962023715449676,0.032291883186705506,0.6345917355240357
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Human Decision Making,1.5213082518766314,0.6188508222206832,1.0392431373910986,0.29639560375048113,0.4233709883707185,0.8324846203979026,0.12640577513886764,0.10919385586234912,1.2922265904701478,0.5256630186692312,0.8827518120958056,0.2517637566169495,0.3596189320155188,0.7071274091286885,0.09658186231147659,0.08767856558146106,0.6478839159541525
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Hypothetical Scenarios,1.216270300530905,0.045430220213296124,0.979052123110273,0.25774249436502916,0.2706481204196374,0.19332008601402673,0.11021648674561368,0.07709365668846596,1.2246891989405144,0.04574468354314587,0.9858290215995149,0.2595265615045057,0.2725215189804425,0.19465822784317321,0.10111166606601962,0.06583217797359636,0.6873449893369715
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Idea Development,1.2554025148170256,0.575157462155169,1.1066563853540674,0.4667085857779427,0.5006990227763292,0.8651073849127695,0.1895666602523965,0.15466896078913667,1.2033961001355158,0.5513309386848331,1.0608119408771874,0.4473746749717131,0.4799570211459375,0.8292693705127683,0.1449675568287878,0.16518907347648276,0.5403088846799481
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Interpretative Analysis,1.3389530394912128,0.3651937249858723,1.0961966013544857,0.5359701188161996,0.8061447586784914,1.3095019326375135,0.22092092778289135,0.1717667113032173,1.3645893151674593,0.3721859097248761,1.1171849388382014,0.5462320752216488,0.8215796161073196,1.3345743224477113,0.20142124165747305,0.2656931116182212,0.4899609269134096
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Literary and Cultural Translation,2.7552945259149157,0.528126309979569,1.7622334090716123,0.2546656508085472,0.11212564988813549,0.11985845332869657,0.08287658789444341,0.006646048782308067,1.332388017940426,0.25538800326335753,0.8521697615182035,0.12314961559531912,0.05422101739382193,0.05796039790374069,0.0933475160790308,0.023995081737528456,0.8310944463688744
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Logical Deduction,1.4316787166852063,0.4060023868928957,1.1438239339222978,0.16665382883689217,0.5606282494406775,0.8834727930841021,0.1257406120436928,0.03263021905939556,1.2869834905081445,0.3649690136121142,1.028221277476568,0.14981065503274493,0.5039673307522385,0.7941830005302518,0.13188878877057675,0.034817872131954974,0.8288073745750396
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,OptionBased,1.110865544258101,0.3154742153641399,0.8523939042561526,0.2996025020523283,0.10439284644757442,0.10294294580246924,0.0950764813565742,0.009350528269912117,1.5304215307571116,0.43462373469728277,1.1743293241046249,0.41275753143987337,0.14382033963173205,0.14182283491462466,0.11133216324014017,0.05298373962828651,0.6674184291985024
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Paraphrasing,1.7834986185331552,0.0,1.1082342430873544,0.0,0.25711571439865555,0.0,0.07745237810728178,0.0,1.6099867562474162,0.0,1.0004170654519355,0.0,0.23210160675391245,0.0,0.10252223938709537,0.0,0.9999999999981867
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Personal Opinion and Advice,1.750996679072047,0.0,1.6304669754446905,0.0,0.8892723956645229,0.0,0.15855899682851704,0.0,1.2002357669861023,0.0,1.1176176426876294,0.0,0.60955942899653,0.0,0.13160959505097403,0.0,0.9999999999983991
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,PostQuality Assessment Rewriting,1.408095176504667,0.7720720935185192,1.5620128699873626,0.5365907762456696,1.1753861229652824,0.658254892877761,0.2870934892372553,0.08727017605854448,1.214729610715537,0.6660478987877365,1.3475106776534853,0.4629038687288548,1.0139771454471207,0.5678605559617511,0.1872836192917422,0.14751531396604656,0.6197734721382084
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Problem Solving,1.32158443801339,0.8677655360954623,1.2457264417618443,0.6670751218021509,0.4987658219161889,0.8902389960945931,0.13230648729217243,0.112315406259474,0.9094089278465329,0.5971269811477234,0.8572095094397875,0.45902785615658337,0.34321082959869154,0.6125914225976645,0.09180179519692833,0.07733439379699647,0.5195647590948601
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Professional Content Generation,1.1149167233555164,0.7039720725937344,0.8959077048667751,0.46434499810521657,0.19283678579899166,0.4894622927767639,0.12244997448629735,0.09568607153560235,1.1760928275778064,0.7425994139730626,0.9450666617452124,0.4898238678755875,0.20341784809611602,0.5163193625547406,0.10669857280473033,0.10546468976793688,0.5066094426406982
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Quality and Compliance Assessment,1.5628720703696473,0.34574089133071095,1.1196589231705445,0.19025247214915142,0.13339085934967843,0.27838092386019847,0.11382805849870203,0.04348967614118893,1.6301453613284302,0.3606231891334599,1.1678542565833647,0.1984418244046371,0.13913262303104654,0.2903637350213145,0.13003283592886972,0.01751114093844469,0.860490456097708
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Quality and Optimization,0.7213255709398372,0.1527530742145209,0.6361439080399066,0.2504283832982571,0.2735479217098478,0.22690945095896384,0.15724145020735913,0.07095566193925068,1.3904384786171524,0.29444922053462896,1.22624096983573,0.4827296775602927,0.5272952622427526,0.4373942148815766,0.17182826537782914,0.1310228672892848,0.5949634460651645
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Specialized Summaries,1.287995073068453,0.07491153333043532,0.9972413280782073,0.015403516228496006,0.3489427552553182,0.15078966709094088,0.11130643434073084,0.0010284116771607232,1.1714506938243665,0.06813315480029125,0.9070058341950361,0.014009727327774835,0.3173686307471642,0.13714544708187706,0.08936550513895541,0.006469835610964536,0.9748611887507104
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Specific Character,0.9643878212731761,0.5311469363235384,0.9761011389847899,0.16206667210842585,0.7210839208323196,0.5439543920219677,0.31096842973676797,0.13286577070014877,1.3818209609291188,0.7610527152615796,1.3986043623563185,0.2322168733858503,1.0332035042495689,0.7794038498879957,0.281562296317022,0.08952507602053211,0.8040032059194011
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Standard Summaries,1.1773193238254227,0.5735452028440756,0.9015587933186174,0.20995618560343687,0.3460429539651078,0.36187103600750625,0.13153277054779022,0.11217198276687235,0.991586898059738,0.48306342814222925,0.7593300043530502,0.1768337600494848,0.29145164984021854,0.3047827134327704,0.08379178402765097,0.023447331284712047,0.7755336174347093
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Supportive Conversation,1.6446706317643323,0.0,1.285920909645594,0.0,0.5374298391189942,0.0,0.12864200178976737,0.0,1.2865986852572049,0.0,1.0059546998294313,0.0,0.42042249133294496,0.0,0.11875352696056551,0.0,0.9999999999982218
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Technical and Practical Support,1.4880813620929707,0.49658341938267103,1.2161897504950658,0.31477695411640694,0.26098211611893607,0.7553982360998094,0.16461101131913924,0.13011191855654203,1.1972368946766228,0.3995265353860833,0.9784863094933909,0.2532540173287565,0.20997334300921605,0.6077561761544532,0.1287471636570282,0.056240674867893914,0.7224910669460046
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Technical and Scientific Translation,1.9631654734724413,0.4016224786941407,1.454891391075736,0.12437261158718416,0.46735130793890955,0.29046342923607504,0.11666543251745837,0.02077949694940734,1.539960073066215,0.3150435304574162,1.1412561412571365,0.0975612390373084,0.36660300114599453,0.22784736679290868,0.13983469974076168,0.021884046038886495,0.8953215308787852
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Textual ExpansionReduction,1.4391472153206701,0.09273322875985346,1.1850655522719569,0.6296495614003728,0.1208250537587667,0.1073832665281039,0.11095034708670304,0.11619154030929463,0.9865364322620158,0.06356869379238494,0.8123632728389716,0.43162521892581895,0.08282565966434521,0.07361130502668681,0.07070568768010949,0.08648063822633315,0.47672230668675575
rb_Skywork_Skywork-Reward-V2-Llama-3.2-3B/Skywork-Reward-V2-Llama-3.2-3B,core_eval_runs,Skywork-Reward-V2-Llama-3.2-3B,Tone Adjustment,2.0858029030375898,0.7840337738406373,1.6824150360579737,0.020721496719628307,0.24068350708746328,0.13049105805946803,0.16795762550930377,0.09598432416295338,1.3507812042742997,0.5077460021164297,1.089544273423913,0.013419392720445122,0.15586839824563548,0.0845069629042602,0.12277558079458045,0.007641058940226209,0.9796465169221586
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Analytical Reasoning,1.3358110657338784,0.3752718353568014,0.9856020672358676,0.2837404135348427,0.3002519008428772,0.26650799455548957,0.12226658085951336,0.08648964116629723,1.2823753498600592,0.3602600423840012,0.9461755694470277,0.27239010172166855,0.28824108907796664,0.25584702172746127,0.10229198110935761,0.05251227458701771,0.6914977246038907
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Casual Conversation,0.7666624116633816,0.7307341900136406,0.8274012038656968,0.1608380027300561,0.3061054356070158,0.8470140124928033,0.2047455285573021,0.38465143099949184,0.9694478054689626,0.9240164198905643,1.0462522606131959,0.20338032282600715,0.38707159536815505,1.0710527386246043,0.14097697716830493,0.06989824093705249,0.7563266340526077
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Content Categorization,1.0994315589349848,0.48093674598886316,0.918636720897189,0.3330584764000595,0.3911538524742069,0.2210570187398247,0.11304051074772736,0.04745142865755636,1.1588103282264022,0.5069114707016057,0.9682510124549265,0.35104650145717864,0.4122795279878462,0.23299600085228633,0.1232043382444632,0.06076404726584994,0.6584255362030264
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,ContextBased,1.1829305107175512,0.5035546321547077,0.9057842906041966,0.24284158919271892,0.29577566837618285,0.3933919687075541,0.11682106947430593,0.055762342482174404,1.1338045280400455,0.48264248565727386,0.8681679277099918,0.2327566082103424,0.28349238526747167,0.37705477318752784,0.0847918296498143,0.03761813931295008,0.7158923449294208
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Creative Writing,1.223582448840558,0.4574365255387183,1.0064684361571323,0.3536062246075493,0.22514588493536275,0.4937628736338141,0.10904951304026095,0.08064021356424175,1.2732244724953776,0.4759952052933105,1.0473019165350879,0.3679524001211756,0.23428028969602055,0.5137953514419323,0.11378528150415923,0.058298722324644225,0.6591307231921963
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Critical Thinking,1.5353477360761345,0.5528147096367422,1.125954192760077,0.2970764538793246,0.32642064449432057,0.5598733839111448,0.1347388270077713,0.1020768867732349,1.1841930491214245,0.426378546841096,0.8684333179812564,0.2291310714018554,0.2517638507663401,0.43182280732707695,0.09109830780840744,0.04518063627361124,0.7086140811219368
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Data Management,0.9367162047009686,0.3912722143903933,0.8266049509749869,0.20927911139807198,0.10019874213907942,0.23362490220400475,0.15275058781179368,0.09930321062469752,1.0581948993561765,0.4420146245467025,0.9338037908541681,0.23641961899648112,0.11319308593292343,0.2639227106717819,0.10791215564545226,0.07154405058673971,0.6932071865208508
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Development and Implementation,0.9133235859928115,0.424682555277619,0.7247479464343837,0.10760771986239792,0.1516754155060635,0.22269256521803987,0.13200483597961998,0.12611102676106672,1.3080211031637323,0.6082113206840974,1.0379515244646418,0.15411095322841617,0.21722273174123435,0.3189303104509496,0.12544387020428455,0.030856634763910484,0.8309576410375219
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Discussion,1.4917475102226605,0.4125881194781853,1.0240577342474118,0.3781413645900077,0.4076814800435396,0.23026772785398408,0.11430299671894628,0.038848050623590785,1.5761310162283788,0.4359269431191102,1.0819854876881871,0.3995316426975465,0.4307427503885053,0.2432932553397913,0.1121493707505517,0.08870299577267501,0.6044328328920846
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Documentation,0.6166655833323282,0.020035444358136423,0.6304439863938716,0.05689322550116105,0.6489245855802837,0.08767389937169445,0.36675411583185036,0.04750685979625546,0.8527855944582525,0.02770697569152447,0.8718397203298943,0.07867752707630538,0.8973965037617919,0.12124405904110702,0.15336835384040937,0.018802736817790544,0.9012602737303611
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Factual,1.0632773736270695,0.3869789048850786,0.8193370510911635,0.31029413230762026,0.2623760876631564,0.3892600618152209,0.1188206428939379,0.11332116628576305,1.2055143772326586,0.4387459425881979,0.9289416095825225,0.3518028756614613,0.29747472650624546,0.44133225500696915,0.09251083455267972,0.12762868117967058,0.5519554188613996
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,General Character,1.362108931475707,0.8100689543812774,0.9713711390705198,0.23980841564357291,0.4207658518692613,0.13506170654314054,0.09459620080257863,0.13569325898301698,1.2487193790309938,0.7426342918046681,0.8905087820506679,0.2198454242161243,0.38573895312058004,0.12381841600781296,0.09908678478180544,0.053473162984941675,0.7126440876035592
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,General Explanation,1.3414924377108364,0.42491927910999205,1.0377781519257727,0.2856324108314623,0.7423659383225265,0.6642901143361475,0.18378087395418075,0.09810958656761237,1.2380758534373686,0.3921619565920311,0.9577751130052288,0.263612809784443,0.6851364322410078,0.6130795277833787,0.12996333491219436,0.08163373864550932,0.6819817502428593
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,GeneralExcerpt Language Translation,1.1551262205878925,0.6521526378399191,0.8782460158922301,0.2446637734543583,0.1477156714009109,0.17973794981649294,0.09387442436858012,0.0401494642485746,1.0471897029623465,0.5912146351920959,0.7961815497894071,0.22180206775924505,0.13391292423305662,0.16294299871714585,0.0779068731568302,0.040543886202526885,0.6956567829386074
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Human Decision Making,1.5328513756620166,0.3064497611813768,1.1295807824439432,0.3156730238321015,0.36119752750479145,0.7969415418587605,0.1298701314830354,0.11136031386230766,1.3609038529261492,0.2720737752803444,1.0028701173689327,0.28026241892656867,0.3206802137854845,0.707544666164446,0.11294737340917271,0.07208268848230373,0.677932523242687
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Hypothetical Scenarios,1.3768718904764392,0.1735400894779935,1.0013083348413598,0.2705609935036981,0.33055255138665374,0.23551869286299082,0.10060389051903729,0.04359739505253929,1.5186486061131954,0.19140953985146636,1.1044132119439012,0.2984207016607867,0.36458959971707844,0.25977008979841837,0.12070270326936783,0.08106613525042117,0.6801988733381179
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Idea Development,1.309986647656796,0.4530894151624094,1.020298846727303,0.3358510404983719,0.5324995007494375,0.6854661371593551,0.14566420060904267,0.11018585154552213,1.2833708899368377,0.44388373499688183,0.9995688439024486,0.3290273603182037,0.5216803998652435,0.6715391244951399,0.13348879182090168,0.10970711709378636,0.6249620709082027
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Interpretative Analysis,1.2625127590917598,0.4514323483357965,1.0860602491306288,0.39001925188369047,0.8281030063384398,1.3654230651289327,0.18263511633243262,0.22300190839172687,1.2314725163529794,0.4403333954170463,1.0593582823432144,0.38043020641536873,0.8077431975805858,1.3318526611250783,0.1771229721273947,0.22705600301798104,0.5706077880835592
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Literary and Cultural Translation,2.160987304690249,0.10192037001088483,1.4235853933980946,0.0777888860077447,0.09365655622621856,0.04820558041055367,0.08261521514550008,0.0015650299241849464,1.2777253264337796,0.06026237995927319,0.841722257015603,0.045994175695267425,0.055376241043656474,0.02850247700776436,0.07801778330506504,0.005170688424830316,0.9418592753219313
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Logical Deduction,1.564098921535286,0.28639279647484295,1.243187486230743,0.23459092673207338,0.6211633361474201,0.6500866843937524,0.12749174114495454,0.014127154284563681,1.4562140089935522,0.26663863554879175,1.1574376839782543,0.21840983916458656,0.5783181226690959,0.6052464609751957,0.1445308537044746,0.016210068383723963,0.8687578578316894
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,OptionBased,1.0811823034938466,0.7441736475879223,0.9831069023966607,0.1519193127879026,0.18800176360115933,0.18559148458063163,0.12115744105969173,0.042904919228626826,1.444680071457373,0.9943677721137729,1.3136313324913087,0.2029951867922255,0.25120870032347953,0.247988075960358,0.1314208328446388,0.02514270588191503,0.8428508757739727
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Paraphrasing,2.3889308349172955,0.0,1.375178956402497,0.0,0.176294694072882,0.0,0.06752083959632116,0.0,1.6356577732431377,0.0,0.9415601811334993,0.0,0.12070579127997788,0.0,0.09171246739489813,0.0,0.9999999999980644
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Personal Opinion and Advice,1.2351819166268476,0.0,1.468362064963969,0.0,0.2658193434067674,0.0,0.23450275494195527,0.0,0.7037013029696104,0.0,0.8365474627155285,0.0,0.15144118918183544,0.0,0.08543919201947303,0.0,0.9999999999978307
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,PostQuality Assessment Rewriting,1.3001733687875048,0.4292018284411081,1.3220380427594345,0.3590933156614736,1.1163035120786786,0.8057218440049685,0.2204208469452953,0.06382363948491321,1.1905815211540005,0.3930243289508639,1.210603217815356,0.3288252753516788,1.0222100878128362,0.7378074045286973,0.15029442765257595,0.11868957628255827,0.6675023069188467
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Problem Solving,1.4427241565729991,0.9801227474188645,1.1398515555252624,0.7701726196020071,0.33365148155590363,0.5585391223104955,0.1587818581524355,0.047383232263844666,1.2028586006413848,0.8171687366887117,0.9503412282739088,0.6421246606609414,0.27817899379988115,0.4656770901110755,0.1339081852354177,0.10292721930308169,0.4871591129994327
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Professional Content Generation,1.1928298709804324,0.7080625029768022,0.942428661625612,0.29532732778456683,0.2875118545915165,0.3237521212930221,0.10213826896855,0.03470333742928397,1.2066122455547612,0.7162437054059818,0.9533178128282318,0.2987396432811742,0.29083386737049166,0.32749286682048473,0.09874735290027603,0.07711646448129245,0.63937594235439
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Quality and Compliance Assessment,1.4783618535193728,0.26775617476254876,1.055329191618901,0.245299392315958,0.08194948669794125,0.08470409129283002,0.09886461026528204,0.0013244534776347128,1.5453180767410681,0.2798830719519676,1.1031259179468915,0.2564092033740246,0.08566104629425275,0.08854040919489993,0.11887934770498554,0.027083871809878146,0.7944474333674219
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Quality and Optimization,0.6991530787402088,0.500756986863024,0.7137295280548285,0.27228082801313735,0.5047812920133692,0.36223050422787473,0.17310668631311643,0.08273671003089067,1.2098529038138044,0.8665374051601642,1.2350767926399313,0.47116970580762785,0.8735012839158026,0.6268235679941503,0.19320140391619955,0.11434058839559208,0.6288086449768454
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Specialized Summaries,0.9653167477213374,0.18163174047547903,0.8168646021752652,0.19484045209194234,0.028923348246332203,0.017216278718054884,0.13503137295187073,0.026471058112128132,1.1836837835321656,0.22271917096957994,1.001649857643864,0.23891586265515063,0.03546618077998998,0.021110821892851177,0.1181112340759149,0.05441725174723294,0.7302865229994274
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Specific Character,1.0167934210883214,0.0719210043446743,0.8425168574651668,0.16360635643642107,0.22381162333471347,0.4545312785050465,0.11251361340190091,0.09899994974708376,2.25752413160133,0.15968179918328085,1.8705885557095074,0.3632451686629343,0.4969152338438416,1.0091679475015174,0.22016896517537565,0.18821929131873,0.7287058974231748
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Standard Summaries,1.2399163932743127,0.4648388528765943,0.8160637164317751,0.29016603089388326,0.25015252977333746,0.35241722535858344,0.13678411603762664,0.09687301338203202,1.2536763532487099,0.469997397472683,0.8251199754953447,0.29338614556668463,0.25292859501116016,0.3563281720494459,0.1117457771475523,0.048596509649837916,0.6762145252200552
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Supportive Conversation,1.355276220859479,0.0,1.114339182499945,0.0,0.22450027448343568,0.0,0.14229926653439606,0.0,1.0218694022848778,0.0,0.8402044519320938,0.0,0.16927173794408779,0.0,0.08889707801832225,0.0,0.9999999999978474
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Technical and Practical Support,1.4805138883591296,0.375078152221223,1.2289266686926208,0.28283237438995634,0.43109561910009425,0.572183023194554,0.12289217181448964,0.06657938141093384,1.2725500847123696,0.3223919330550016,1.056302645079095,0.2431036715151933,0.37054077703584787,0.49181001297070304,0.138952526886011,0.046890570449846236,0.7605932975182071
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Technical and Scientific Translation,1.8318120556010395,0.4166339449769283,1.5214647202098806,0.19773374337650407,0.4345388748437053,0.39735171281270676,0.1304691200015317,0.012665063691198963,1.2586078009173924,0.286262300584595,1.0453732738692143,0.13585958840990875,0.29856448209732134,0.27301379741704346,0.11637047851892302,0.029693425309053834,0.8418010205634571
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Textual ExpansionReduction,1.2876054853233247,0.48295965873823454,1.1734268857502026,0.271403276028481,0.10708525362630136,0.117328939463544,0.14243122890865167,0.14608455414546886,1.176426396950899,0.4412582096599158,1.072106618854444,0.24796879306450104,0.09783891147258447,0.10719809994946354,0.12277397143705238,0.0989463110036238,0.698074880824994
rb_Skywork_Skywork-Reward-V2-Qwen3-1.7B/Skywork-Reward-V2-Qwen3-1.7B,core_eval_runs,Skywork-Reward-V2-Qwen3-1.7B,Tone Adjustment,1.9300309256875425,0.7577745077751856,1.49864358630917,0.08138995763960466,0.18455850785754835,0.019282232164221452,0.14169808671516315,0.07248397098159243,1.2276958368424047,0.48202160705248154,0.9532896428418172,0.051772285524039785,0.1173979695013655,0.01226545950014267,0.09362523216781199,0.005466301540815899,0.9452810634483169
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Analytical Reasoning,1.1916195909074951,0.40249102777146417,0.8861394071638476,0.261206163169581,0.20384866275277236,0.36698534045152076,0.11183465729621922,0.09837288289994006,1.3161339784207606,0.44454800986956977,0.9787336430897156,0.2885000459404923,0.22514916131937412,0.4053322720069752,0.10719918600960943,0.05348637207797416,0.6903243826436023
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Casual Conversation,0.6759344468898585,0.6266144756342522,0.7200832100360118,0.22524130304736367,0.20038381296094054,0.3384003296689082,0.18587211723134484,0.16014341387254974,0.8981700045134866,0.8326344795686085,0.9568341175450078,0.29929674853165045,0.2662665455499187,0.4496605063177302,0.12579681719153885,0.05426237013660787,0.6958441332731976
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Content Categorization,0.8881564966395578,0.8485994615161445,0.7154914820132717,0.3053398879051794,0.24369443535883833,0.13512914188144115,0.11874285135285712,0.1558954073929938,1.1475296328263314,1.0964205431978504,0.9244403218932539,0.39450994366509906,0.31486183683531327,0.17459163464327795,0.0919442432024794,0.029238408515621206,0.6945885755588875
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,ContextBased,1.072082273089297,0.5685241033430718,0.8246743528841204,0.2940578216667735,0.08835366969171152,0.29018117006591526,0.1010698930577984,0.06349387841396797,1.0469833569969056,0.5552141745022642,0.8053675954587396,0.2871735248384544,0.08628518913036712,0.2833876309673822,0.07302324413962119,0.03695691257713471,0.6581151359031826
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Creative Writing,1.1771105324041993,0.45843391503541553,1.0440546860579167,0.3502526023075183,0.1824820890364761,0.4045212131963656,0.12738701458324414,0.06892246808835095,1.293382033723523,0.5037166629928178,1.1471833239093572,0.384849519750876,0.20050713075691823,0.44447862371905444,0.1357714537040678,0.04973061503355669,0.7045471968749829
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Critical Thinking,1.298741196971629,0.3626542782117308,1.0584795294621054,0.24924461141427257,0.309526581403643,0.31645628098730666,0.15701385362793485,0.031541810998685804,1.1203321403713904,0.31283618681780045,0.913075399122407,0.21500569138247427,0.26700668174258124,0.27298444327413157,0.09298075181516363,0.028923091001303525,0.7648764902680856
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Data Management,0.7674642288907492,0.4667170715690132,0.7715827482613641,0.15639520888719594,0.27314565858940887,0.15591824063243215,0.15142832777524545,0.08572368978933043,1.0931685601760914,0.6647872434056192,1.0990349389869185,0.22276780973200627,0.3890660112590604,0.22208842080326915,0.13378312771535267,0.06619239656056447,0.750079753400241
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Development and Implementation,0.8982623085324005,0.2591057984954237,0.752642371447913,0.16761010355845718,0.19403158834258216,0.15158717839264232,0.12148399752702782,0.05754797257047334,1.2564914687016093,0.3624378115480895,1.0527979519175596,0.23445341431889577,0.27141185051992334,0.21204050821869005,0.11473024917507407,0.0281597950867066,0.791745121560859
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Discussion,1.3859399167327298,0.3003591663294214,0.9704486843169708,0.1605841581472428,0.21886301185071028,0.15938309042426396,0.10763435388595272,0.031058492203536875,1.5336597088998847,0.33237281503814686,1.073883527505658,0.17769994951774448,0.24219042903044014,0.17637086652348682,0.11283337163188883,0.02416849680651964,0.8296857847551963
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Documentation,0.45071921042078983,0.1411926290171469,0.43185502822081656,0.1304772602164817,0.30028698195875814,0.2390746356363959,0.22369490855281193,0.1086060917827748,2.1753720569069523,0.6814586392232544,2.0843250948839414,0.6297414873258291,1.4493189873050805,1.1538808860467369,0.46441335084966123,0.33444878604673656,0.653327953693466
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Factual,1.0121692454388718,0.6524023420536675,0.770495972458602,0.2082584260077991,0.2748780834853248,0.36092185331581506,0.16207322276512626,0.19971106113963516,1.1935524267265039,0.7693144225327444,0.9085707176493156,0.2455788405624526,0.32413690208062523,0.4255999239504008,0.10229093058165517,0.09625294718484151,0.6570787477015051
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,General Character,1.2152960644850124,0.39095055151169067,0.9649847284820509,0.18663369291031162,0.2275251363302898,0.46169123476159063,0.1180001983783931,0.08378402989602493,1.1247748045312953,0.3618306213673972,0.8931078945063673,0.17273229264599366,0.2105779391709814,0.42730218874923764,0.09312065576202455,0.04358202658416102,0.7605575897295788
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,General Explanation,1.072948485537255,0.45317347902333727,0.8679909906462402,0.3425502260565453,0.42964137418714626,0.6871952087133119,0.13104227398189827,0.10401684758581387,1.0259548351269046,0.43332511133791085,0.8299741932663189,0.3275470028932008,0.4108236799431738,0.6570970149628721,0.1035720690104518,0.10398631681358272,0.5674615745918864
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,GeneralExcerpt Language Translation,1.1337999100063014,1.0278693460581096,0.8689434233147124,0.24527533036554472,0.13801651670796766,0.24109579801496447,0.10054539445014499,0.10458349108468507,1.3105178573848195,1.1880765919800813,1.004379929175244,0.28350478561796755,0.15952824494300472,0.2786738170029476,0.12206793676295796,0.0316856669178186,0.7630400915119168
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Human Decision Making,1.318664083274662,0.33392489868779207,0.9511774624713136,0.2312907543332181,0.328583255258718,0.5015370073676566,0.10722793419487503,0.05396630227637009,1.2749997096562957,0.32286778283720285,0.9196815200053761,0.22363212008377387,0.3177030150183632,0.48492982169323084,0.1004459161046316,0.043962259021505246,0.7315199932072509
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Hypothetical Scenarios,0.9193401447660441,0.21799679940275252,0.8498506572743058,0.2763097401701442,0.2402295855670065,0.13686156677735709,0.11112343096076915,0.031021377185772525,1.2544267395873125,0.2974535767551574,1.1596093080565295,0.3770207669611314,0.32778990180170986,0.1867456892476087,0.1019074808982322,0.08261221201316238,0.6286925820489309
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Idea Development,1.1859892099957683,0.506337268016757,0.9850199015838932,0.31000580541999,0.40480995067901815,0.6109685132930117,0.1525043110640215,0.10977758847918659,1.1900647742912862,0.5080772586284984,0.9884048496993016,0.31107111746610694,0.40620105016588764,0.6130680614486579,0.12804584674400815,0.09460445173965182,0.6431283029774385
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Interpretative Analysis,1.110556542652763,0.2406265996056538,1.0870444890306448,0.42508874603184865,0.6236729625297284,0.8309864750743325,0.14544568765541227,0.16500212861821956,1.1566028014430916,0.25060353842122485,1.1321158833596017,0.4427139147259913,0.6495318949930653,0.8654411082361307,0.1597577823124754,0.1885880350666716,0.5557240199413166
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Literary and Cultural Translation,2.5574922525958654,0.24049125391066095,1.460562515027182,0.2829938123814124,0.016169299028548516,0.06294477121827816,0.0730297134415433,0.004586470439427226,1.4011026974821628,0.13175130608238805,0.8001580757747413,0.15503600978493892,0.008858227610386803,0.034483814626148634,0.07776099751347532,0.01619587930917543,0.8181140019829172
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Logical Deduction,1.611443890684451,0.28296273299959895,1.2277438581815936,0.1202110386110542,0.5220373686359949,0.486233920787066,0.09687663922865186,0.02356442495664335,1.463582009579657,0.25699881193120655,1.1150892895459574,0.1091807877898221,0.4741365836445117,0.4416183666246448,0.1184420084973703,0.04501018664928613,0.8558107348540299
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,OptionBased,1.3830525419062034,0.6332012994572658,1.0161514165537895,0.20479508005702274,0.1131850931998396,0.10048064396312292,0.08746042557616729,0.006015458647401828,1.409603288783984,0.6453569963054857,1.035658686330564,0.20872657374049086,0.11535793094015066,0.1024095917529909,0.09300274374308093,0.016691598296504573,0.8270691432620665
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Paraphrasing,2.187114246723176,0.0,1.3452660289854448,0.0,0.05774749653053041,0.0,0.07297531435199933,0.0,1.5557548356715405,0.0,0.9569249219579777,0.0,0.04107739095472565,0.0,0.09100085161113403,0.0,0.9999999999980915
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Personal Opinion and Advice,1.8473424140116679,0.0,1.5178929463049917,0.0,0.6698709597541528,0.0,0.1292401958160606,0.0,1.2797722061480594,0.0,1.0515414954236242,0.0,0.4640624442424973,0.0,0.11909740608964159,0.0,0.9999999999982916
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,PostQuality Assessment Rewriting,1.524353447479345,0.47410694651565466,1.4887104093430008,0.6431787835800408,0.8246342504559743,0.5058680696074465,0.23601127637804176,0.07382526584174898,1.2262016712506025,0.38137528480047156,1.1975301364412017,0.517377974670446,0.6633421518819407,0.40692417720488794,0.15008228937252788,0.13733942438856783,0.556016863019178
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Problem Solving,1.3980668910041412,0.5399390925604594,1.0459675718804904,0.46358406937009133,0.30837163147303237,0.7403229055213998,0.14132786362026062,0.03670863091985466,1.2484267038936063,0.48214744656774955,0.9340138562357897,0.4139649793763508,0.2753654935477843,0.661083450801984,0.11520647332199452,0.06280619712398872,0.613284686978858
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Professional Content Generation,1.1521347401547448,0.7781294696562315,0.8850696122330037,0.3480547386049302,0.2650610090751346,0.4391697111146838,0.11563067871779459,0.0898216952110677,1.1952422573560493,0.8072434511455342,0.9181847959037623,0.3610773262489694,0.2749783578103761,0.4556013967642506,0.10680395797902398,0.06968712096715046,0.6121779325162785
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Quality and Compliance Assessment,1.7968133545474536,0.420510051298256,1.3597690721245184,0.2603088713398096,0.15938309042426393,0.08777619472640624,0.10766850297662311,0.011837278431516163,1.6603465174932166,0.3885725790477559,1.256495471709998,0.2405385773140014,0.14727804558191476,0.08110964829148928,0.1382642179964576,0.03423345100985947,0.8057975459868268
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Quality and Optimization,0.8101973763233417,0.5419602549390279,0.7489208661159455,0.32501814939652063,0.24484938528944894,0.23041251115681632,0.13819967353243878,0.11467689057596614,1.4603557647309615,0.9768666323592357,1.349906746332445,0.5858351828628645,0.44133345990443884,0.41531143986290353,0.19896760900640642,0.1573680065037602,0.5709553323096643
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Specialized Summaries,1.35901514647537,0.19598056635048766,0.9815229698495445,0.034768805202756936,0.1131850931998396,0.1131850931998396,0.09406250483836737,0.009398445086688023,1.089637680510199,0.1571342381996701,0.7869701930902817,0.027877099349112644,0.09075008673925702,0.09075008673925704,0.0679163179748602,0.006450253218417745,0.9497083428182639
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Specific Character,1.0169334139026405,0.60750366350118,0.848519256659852,0.24168870301609946,0.23849716067109059,0.34359760435665593,0.09691687364149859,0.2537859775890998,1.751885291206909,1.0465549837306463,1.461755887579214,0.4163604795321598,0.4108623652858906,0.5919203567678086,0.17710387513444703,0.13212756911072898,0.6621463092115911
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Standard Summaries,1.0735259605025602,0.6060599760879166,0.8838675419493214,0.18465814704375938,0.29104738251387324,0.44869804804222124,0.12332053624057038,0.13747808342704476,1.081862227667809,0.610766222666684,0.8907310517660115,0.18609207570932773,0.2933074570976738,0.45218232969224714,0.09479798898023462,0.03253637804226722,0.771255610393404
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Supportive Conversation,1.4419549883673444,0.0,1.142566300799061,0.0,0.18017218917525488,0.0,0.12072648208198833,0.0,1.2512655471122276,0.0,0.9914691228330634,0.0,0.15634555494554064,0.0,0.1070988046294401,0.0,0.9999999999981795
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Technical and Practical Support,1.2225145015513288,0.4898431393202243,1.1203655965773238,0.25577328671647415,0.3672740779341734,0.508755444433973,0.15860413778565252,0.06726661871699707,1.0317174406155722,0.41339363202747237,0.9455108502906895,0.21585491248904753,0.3099538461178573,0.4293543057072835,0.0979449993932755,0.027404383702798574,0.7776857152726232
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Technical and Scientific Translation,1.850951632544826,0.21409884338694152,1.5413368257992612,0.04498690389440885,0.448120573076916,0.31876618084852787,0.12124734180849583,0.009664522907006479,1.257473611640167,0.14545147539679948,1.0471318380302674,0.030562573068878773,0.30443788242189207,0.21655890605268613,0.10999068279951646,0.00907275688812098,0.9577991509495077
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Textual ExpansionReduction,1.1794565244507522,0.25417921519766296,1.2583540415855894,0.19999682452932976,0.033493547987707636,0.07175126443918405,0.1185582990111268,0.1318686383666494,1.0240514555350146,0.2206885882556182,1.0925534440230205,0.17364526374298983,0.029080441590516648,0.062297325303822315,0.11251636548195293,0.0934470585533109,0.7658180453413485
rb_Skywork_Skywork-Reward-V2-Qwen3-4B/Skywork-Reward-V2-Qwen3-4B,core_eval_runs,Skywork-Reward-V2-Qwen3-4B,Tone Adjustment,1.690124854707299,0.6934030645903442,1.5037768915975287,0.06657003072269463,0.06467719611419406,0.004619799722442433,0.1977929251748547,0.1231556818911973,1.2820690568586621,0.525991089099868,1.1407120697424347,0.05049767552142059,0.04906183800057065,0.003504417000040755,0.12950436218385253,0.015230800219809382,0.9377005214715962
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Analytical Reasoning,1.1370839794716052,0.4428471178061758,0.8987349146989825,0.31569615881561175,0.2777641510007547,0.46530006704374305,0.12433697736148575,0.1344761302239261,1.1436744061793687,0.44541381606710984,0.9039438937119344,0.31752590264646174,0.27937404465176296,0.46799690038616165,0.10259945161692047,0.042958798982063284,0.6824742182325704
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Casual Conversation,1.0151370496094547,0.5400853925568146,0.8774348817969867,0.02316436416704193,0.2919230432556657,0.8998531278672626,0.10573494409553458,0.33018960554895144,1.4088977218087235,0.749578669646833,1.217782373793647,0.03214957026211285,0.4051568315279291,1.2488964148263435,0.17002785022196626,0.12896157594963079,0.9502557256902092
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Content Categorization,0.9647480507022714,0.28227556274864307,0.7893428195813827,0.04296734099115951,0.18656422735882774,0.11049488465597238,0.1133244372592237,0.10720229433055772,1.0400955798452045,0.30432149087988825,0.8509910717430528,0.0463231217830139,0.20113503011682804,0.11912461605133565,0.07549336577209576,0.023905815175037426,0.9112856094696203
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,ContextBased,1.11299998137134,0.3838864365780045,0.8423538355728089,0.20543346455694056,0.11131474698323285,0.3589695624627445,0.11089537751911638,0.05785680865788381,1.109574663414719,0.3827050051974814,0.8397614458449483,0.20480123190032284,0.11097216979719499,0.3578648141170445,0.0748074244225777,0.047221015042900355,0.7034011365852271
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Creative Writing,1.0736118864955457,0.4717982351190667,0.953997780656876,0.4169971140815233,0.33203990464458033,0.37448187824202195,0.15152882530922912,0.06525654093751909,1.2107177618756848,0.5320493471293404,1.0758283066345833,0.4702498351777482,0.3744431439902561,0.4223051804764688,0.1314536722734348,0.07164745729663222,0.6156113028549254
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Critical Thinking,1.477522050600716,0.5857894638796526,1.1714555387200467,0.3252515795316251,0.47640508057700653,0.5408141590699351,0.15682876059289463,0.07773112511564362,1.2551188188581957,0.4976138120614042,0.9951228081539697,0.27629325610912003,0.40469445568595713,0.45940839141972284,0.12890880325694287,0.07889659630768031,0.6824950714469327
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Data Management,0.8038641671391157,0.32738968022752624,0.7810911775809717,0.10974201872112166,0.16629757766062175,0.2800545600419903,0.16386061601014806,0.10532101767639951,1.0388234010981283,0.42308150431530567,1.0093941574111547,0.1418182098313645,0.21490423666302155,0.3619109327776677,0.12401336556836862,0.052725410862388994,0.8096105252546093
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Development and Implementation,0.958987324931891,0.23595767915433774,0.7740116109564075,0.1358098812182149,0.16962908172060082,0.11854601946758844,0.12968950074752467,0.058497392540793364,1.2759363121150216,0.3139425966623566,1.0298254155633035,0.1806955675901718,0.2256921435050382,0.1577259333496748,0.10953421006081382,0.030653417509291486,0.805985335126552
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Discussion,1.580312832117987,0.3447065607059592,1.082012944910684,0.22334910269932862,0.4823740253511357,0.4025567405808039,0.09695761257720042,0.044077589077912316,1.3790840609305492,0.3008134300413532,0.9442350753090899,0.19490899605173095,0.42095103972362935,0.35129727056072657,0.09636182733730314,0.027747737290280794,0.7862643444509796
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Documentation,0.44007954021468276,0.13550719248128396,0.4582591799910557,0.14034732034157188,0.44621332503344624,0.428861741387722,0.36955668050255464,0.13024252900842503,1.2110560254309686,0.3729025936166839,1.2610846231717332,0.38622215399933335,1.2279356037450089,1.1801857358127872,0.36588681077104906,0.25796907452447415,0.6524445804993364
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Factual,0.9436485249890707,0.6212387492678674,0.8110954396234713,0.30309746312351205,0.3375924114112121,0.5002114533389402,0.11795481584109557,0.2417909412264,1.183694390376429,0.779269826762417,1.017422373391964,0.38019957360796297,0.4234693670778869,0.6274555362932424,0.12920844042804958,0.17081818588056985,0.5584021018612886
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,General Character,1.3636956618847647,0.4533621774954846,1.025648253391287,0.24761288248570112,0.40283436591913546,0.36854763663518425,0.1587779464004005,0.07680434510783538,1.1674061347952907,0.38810550043174064,0.878017065403147,0.2119716342491228,0.344850631430775,0.3154991131801198,0.10561492767307296,0.037360923861206125,0.7505246063579265
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,General Explanation,1.2057268443740905,0.47718590184106424,0.9436485249890707,0.414629586890751,0.7073893620688885,0.8402330864605537,0.15725790854776756,0.136705224158512,1.1219124352381666,0.4440149936893394,0.8780521223519522,0.3858071931639525,0.658216183510672,0.7818254628708983,0.11504224137758179,0.12727961819532646,0.5331349765917943
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,GeneralExcerpt Language Translation,1.1161492938030388,0.5274924807259305,0.7726746955359263,0.1330032625635189,0.1255560592604611,0.18423911515030067,0.09331304182808137,0.05472770148561951,1.0066578967218498,0.47574681463522883,0.6968772799274368,0.11995598195752,0.11323933029170864,0.1661657281698953,0.06570828997332362,0.025396719418684183,0.788055315589753
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Human Decision Making,1.5014672360318158,0.7694299493941759,1.1612952225186057,0.22285313660012185,0.42920877306063643,0.9397617702524286,0.1328062561051046,0.13259094958323636,1.213511194257916,0.6218662881081669,0.9385784241879416,0.18011367111443644,0.34689378381597347,0.7595313442542497,0.09755938312873808,0.09322433467401309,0.7264295013588346
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Hypothetical Scenarios,1.3360719407207715,0.2772089003240914,0.8780672506231865,0.21985613251375746,0.38423346825091903,0.13103915969251,0.14856338339651742,0.05726167692411238,1.4375495039774644,0.29826351786420746,0.9447583637683931,0.23655468287961634,0.4134168339750256,0.14099186823425724,0.09413149043044988,0.07976124371317977,0.6793984880577721
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Idea Development,1.2362656315905654,0.47363750298551355,1.0301731608153442,0.37412424282354617,0.5513639219265355,0.6639409966199948,0.1644507438468506,0.12373889614158498,1.1824832094288498,0.45303240689106483,0.9853565725200308,0.3578483653306319,0.5273774205986292,0.6350569493260936,0.1432133594947027,0.12324492670792891,0.6000283240037276
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Interpretative Analysis,1.0952666628854097,0.27108379129715066,1.051660205229966,0.5341434391350234,0.8084449852215868,1.2443167664021815,0.1806275524774959,0.22961638621028324,1.0779624701154962,0.2668009199742466,1.035044953861131,0.5257044704794567,0.7956723077155541,1.224657720872635,0.17676941144803998,0.25976980969121155,0.47005832902230865
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Literary and Cultural Translation,2.550821608590639,0.2931723572781575,1.623552978563132,0.1827738703091062,0.365354945244371,0.13381541307582587,0.08322538146119252,0.010608439432590322,1.2893417451515565,0.14818729678711828,0.8206432875104813,0.09238512803827603,0.18467280546576495,0.0676385199350294,0.07802169685824745,0.006871840832106679,0.9058969823597289
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Logical Deduction,1.739600369985736,0.4328179024589469,1.356662486647031,0.13954529158639173,0.6774058255290769,0.34869742494447564,0.1241061935363108,0.02327092538842046,1.5246605695816544,0.37934022150938396,1.1890373417420497,0.12230349419521591,0.5937075949216783,0.30561341771378203,0.1328337879845044,0.0772703030850157,0.839618202015489
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,OptionBased,1.586906433903362,0.4469767947138581,1.1192157042239859,0.24450309310563967,0.08106659879282396,0.09938987112270882,0.0867163103398042,0.013116864819748941,1.7242348752988454,0.48565747887178823,1.2160709093972155,0.26566201462439687,0.08808197753451065,0.107990917662174,0.1138690444985011,0.028090149184976765,0.7931712569140235
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Paraphrasing,2.4564289935578985,0.0,1.4607565648914693,0.0,0.2554153112650618,0.0,0.07166667065171639,0.0,1.6225922348691992,0.0,0.9649015971734163,0.0,0.16871438246831638,0.0,0.09636112075970871,0.0,0.9999999999981155
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Personal Opinion and Advice,2.0008458133557614,0.0,1.4808689782906026,0.0,0.29428285863148423,0.0,0.09988627798455696,0.0,1.5238416625541762,0.0,1.127827956977122,0.0,0.2241254561270191,0.0,0.1195700491887377,0.0,0.9999999999983966
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,PostQuality Assessment Rewriting,1.5388772503719972,0.44975304809717387,1.5468512670340766,0.6029058371692904,0.8649417415720652,0.8034477291316182,0.21716694050812313,0.1524895713849812,1.20003282933037,0.3507222052165252,1.2062510522417875,0.4701523772758116,0.6744907594765952,0.6265370752448296,0.13576043524314318,0.16878144367043446,0.5524084618844555
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Problem Solving,1.5175000993204648,0.5604040970059581,1.1454590105112747,0.59806860123961,0.21876876660529204,0.7518094162019427,0.14068699436276422,0.004251132349271547,1.145670134027057,0.42308942003603944,0.8647895171029029,0.45152506734357745,0.16516430033174548,0.5675950828659477,0.10260381863912049,0.07720748631759555,0.5398402723045107
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Professional Content Generation,1.10078446648475,0.9202585902346344,0.8820156998794579,0.31989765200005005,0.2980308006989607,0.5391484070399457,0.13142516521837755,0.06839549264000566,1.1960446606997763,0.9998963528510931,0.9583439815997956,0.34758110265390063,0.3238219276825246,0.5858054807260947,0.11782504852536857,0.10777959062875869,0.5948613132301117
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Quality and Compliance Assessment,1.809145517237799,0.3076609296223376,1.406457675802783,0.2757725747889732,0.2176582652519657,0.14325467457909988,0.10277081172690505,0.008381862604370238,1.4672460785743318,0.24951795652575637,1.140659769952134,0.2236559884664504,0.17652434982003776,0.11618184248359628,0.12419131174856568,0.025051683257643287,0.8196676385902794
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Quality and Optimization,0.7769345093209515,0.3891613180063047,0.8252702374967241,0.3503959521880181,0.374794206747645,0.41282887809907276,0.15950910941080731,0.13356616655480363,1.5210011203829588,0.7618593248157275,1.6156277534232808,0.6859685462053104,0.7337328970937982,0.8081931985025762,0.22373140546456172,0.1848092378963644,0.5707101656373013
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Specialized Summaries,1.312820818635501,0.07516706035327747,0.9833238849718886,0.030112709440285435,0.1349259144291522,0.08162184946948713,0.10688828025645902,0.008139236569024932,1.0595459443152393,0.06066551719235558,0.7936169349852016,0.02430323979243354,0.10889544360289852,0.06587502143879048,0.07350880884610689,0.009966115894589511,0.9512660125852159
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Specific Character,0.9816831963404984,0.5558753336744238,0.9466175737462279,0.182725671465646,0.11715789277593051,0.5211027600483923,0.15147936253206518,0.1701823724047698,1.3776169389223052,0.7800716956892493,1.3284086038506633,0.25642282675857686,0.16441016635328415,0.7312746024765743,0.17029909801936682,0.07646635457805417,0.7633867469543741
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Standard Summaries,1.1132776067096715,0.4404178960957743,0.8257796992720999,0.23576946267062493,0.43809278388724726,0.6157903520031098,0.1291775014371237,0.1049022242149294,1.1402647325046993,0.4510941309294618,0.8457975461136454,0.24148478480534719,0.44871265533476695,0.6307178162696271,0.08018857243386801,0.035119626799866355,0.7077440338259872
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Supportive Conversation,1.696290817206008,0.0,1.2444555790713472,0.0,0.4086644980240989,0.0,0.10930094206164132,0.0,1.553973808598038,0.0,1.1400470698920957,0.0,0.37437797811068935,0.0,0.13702966487968382,0.0,0.9999999999984339
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Technical and Practical Support,1.3936791984245762,0.6381218401551572,1.2510723163015833,0.44952063929886343,0.3331504059979067,0.6414533442151361,0.16085388038554715,0.08513057547793823,0.990911212341969,0.4537070560892462,0.8895171766036573,0.31961088474585586,0.23687120613752247,0.45607576815062134,0.09118598492447028,0.06321368751418044,0.6229430540076174
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Technical and Scientific Translation,1.9744714062142603,0.4902863474935861,1.6239867681542748,0.05746266117342347,0.49278497553857026,0.38395584291258744,0.12898017217670715,0.029111901640214433,1.23094130905762,0.30565837342460034,1.01243927462941,0.03582384791399362,0.30721608880445384,0.2393689300374984,0.10805698624161181,0.0160505514018065,0.9444498555943626
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Textual ExpansionReduction,1.1941359864987469,0.3716459787899238,1.3253332383978065,0.2330858715638615,0.2277221837664858,0.048202699367822094,0.13393648442451433,0.13367668294519794,0.9888048327193952,0.30774161740631234,1.0974427752854368,0.19300685922440397,0.18856545516723833,0.03991426656923103,0.11774175617890414,0.098969359549968,0.7476065398427625
rb_Skywork_Skywork-Reward-V2-Qwen3-8B/Skywork-Reward-V2-Qwen3-8B,core_eval_runs,Skywork-Reward-V2-Qwen3-8B,Tone Adjustment,1.6252187305931214,0.6596378038758552,1.4822455372598298,0.07798572871883902,0.18600897668216457,0.11160538600929876,0.21072305846917228,0.13395792085489122,1.166546689113225,0.47347368181295235,1.0639236377199959,0.05597646145872459,0.13351320152132914,0.08010792091279748,0.11944305047669651,0.016336257855528558,0.9268227523689058
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Analytical Reasoning,1.2768149389425805,0.551030682908509,1.1575929199266377,0.41383721984323163,0.3450142852387913,0.2779489745026019,0.143069683364908,0.14538697042948032,1.1071875391119417,0.4778251625798695,1.0038044020446875,0.35885812348860746,0.2991784523984031,0.24102290135568588,0.11025271760977462,0.07227814959168608,0.629593226551741
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Casual Conversation,0.6884373488690477,0.5975739149581534,0.44312661758856664,0.2515625161787197,0.13834267867144154,0.12069883907092278,0.1018908284910241,0.22383437526666955,1.2208390638489868,1.0597065660608966,0.7858177448857417,0.44610790982745274,0.24532972622281474,0.2140410568094179,0.10491977366556388,0.07737750066116666,0.5264767321956837
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Content Categorization,0.8191047233110859,0.8620677067415268,0.8093675910823882,0.3256700225467989,0.34557589286533297,0.2778085725959665,0.16876380236966304,0.07470475117079489,0.9936815885824384,1.045801329087448,0.9818691685757052,0.3950805020997026,0.41922893669824735,0.33701827847248056,0.10361109096852072,0.04919683125256302,0.6468730484555733
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,ContextBased,0.9910736586217124,0.5981355225846949,0.7860920750046039,0.3552460741848482,0.20526758750099547,0.2785573827646888,0.12041913139359278,0.08309735299109791,0.9027527220361597,0.5448318260327787,0.7160383633526957,0.32358781577247064,0.18697487492506298,0.2537333459995383,0.06582493547407225,0.049537842492670264,0.5685225739873797
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Creative Writing,1.206707586895957,0.6391094790044676,0.905865301505856,0.34987700127369703,0.12661911946738333,0.3359232617841474,0.10522091756863639,0.05943084324081177,1.269884610612963,0.6725699752765524,0.9532917652648703,0.36819476760544145,0.13324824751835915,0.35351048184250683,0.09363637607624964,0.0480391186239181,0.6435891701817216
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Critical Thinking,1.3175314918668546,0.6948958365742773,0.907089918135954,0.35242958593738827,0.3564336403118061,0.28941513021116183,0.11223614641542412,0.08487106258078203,1.3747657008966918,0.725082449805132,0.9464943455093663,0.3677392986192902,0.3719172910638393,0.3019874642776762,0.10304386030081236,0.05447122381491276,0.6389858423284748
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Data Management,0.7987464468489489,0.6722911296059735,0.8350169393964343,0.3360948641144794,0.504089645456731,0.38755606294932593,0.20077075338034672,0.23496610888076216,0.983555518795886,0.8278417429837057,1.0282180562697718,0.413858439987857,0.6207228272660976,0.4772264165435479,0.154937655451105,0.12525428137519276,0.5842519283723815
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Development and Implementation,0.9231425361279378,0.4532173546191609,0.8024462970923232,0.3044855848655248,0.2544082548233952,0.10614384141638344,0.10523294344921041,0.07585580555646729,1.241770602079134,0.6096479853249344,1.0794153475540937,0.4095805720629029,0.34221875756158027,0.1427800114329772,0.13139329330333266,0.09493451605375636,0.6080014846783945
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Discussion,0.8609444914884434,0.34012946890376705,0.6837247848731827,0.1856317958500865,0.32591962593637314,0.2882919149580785,0.11913385791914582,0.06889268314679622,1.0054235484427068,0.3972081602658689,0.7984637873423797,0.2167835217398466,0.3806136981601984,0.3366715078499171,0.07956658898852731,0.026678379583751965,0.7407863310207311
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Documentation,1.2165357203604368,0.4589270321556683,0.8124947835491615,0.3286503130186669,0.26235266270718316,0.20490488257552064,0.10269437455072589,0.016508183610130456,2.0060312130245554,0.7567570237331229,1.3397797277471986,0.5419345893850143,0.4326117362626915,0.33788205579844427,0.21839038963057772,0.14264893220746946,0.6067723387498607
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Factual,0.9947475085120061,0.6132287275480037,0.9416079868858047,0.3087671930090783,0.30102168782635724,0.19525225149433495,0.1603400852193817,0.09920615429451435,0.9731401773469339,0.5999085271125326,0.9211549217348776,0.3020603302774121,0.294483068487202,0.19101109479611422,0.09585575266287572,0.06849176407502605,0.6379293000922337
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,General Character,1.0592387842932125,0.6004989546797248,0.7630635622810857,0.3401717194775229,0.12598731088752388,0.30532734629651037,0.11057437076315324,0.11366603745750944,1.6295633389446817,0.9238248222506611,1.173918879119877,0.5233299339357451,0.19382249407675003,0.4697243504296869,0.14971378485279307,0.06111396939627595,0.6530038031583636
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,General Explanation,1.3100433901796318,0.43936436649779864,1.0277107561476349,0.38466352366910517,0.17222633880612478,0.5045108511766373,0.12560680146179315,0.058144949989570316,1.2378930854490462,0.4151664863602329,0.9711098490427296,0.3634782784630963,0.16274101723537046,0.4767250450536124,0.10554158126784563,0.06967053295373093,0.6201989936351953
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,GeneralExcerpt Language Translation,1.3460623293110623,0.6884636742265418,1.0498366851856396,0.29177261936834464,0.19768588454268235,0.36527896042983804,0.10376580406065022,0.05111029558609065,1.0336450495540828,0.5286731922928879,0.8061725440587935,0.224053015256638,0.15180354688987308,0.2804987413530917,0.0761094097056117,0.025244766000089514,0.739018037782758
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Human Decision Making,1.1278836164790507,0.3697250208066265,0.8926181216112726,0.3062003081511926,0.12617451342970448,0.2456097353409084,0.128528196092158,0.08541295286874326,1.2744317748835592,0.4177641270793315,1.0085977670014437,0.3459855223406957,0.14256861855770603,0.27752229606485207,0.11444196819682978,0.07759115039049119,0.6340858655353303
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Hypothetical Scenarios,1.0824050988880582,0.11859281047139136,0.7849181590630132,0.11356434218559663,0.10483342362111943,0.034445267761224946,0.10230720308009822,0.014558568160779939,1.2055349524864525,0.13208343002424217,0.874207149027523,0.12648294432602925,0.11675883317060072,0.03836361661319737,0.0825966306839857,0.027605925422195365,0.8171399862697155
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Idea Development,1.1554140903384806,0.596661302565023,0.893663335805114,0.30207080207316084,0.15462929984115117,0.29638842490738815,0.11545535061799367,0.08086357136160063,1.2397132124745522,0.6401937680588976,0.9588650979469404,0.32410991657820076,0.16591106829293262,0.3180129526329123,0.10851994506703383,0.08362344394450419,0.6250625901577923
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Interpretative Analysis,1.016041797685046,0.1080158668381892,0.84311864941636,0.2972542366649732,0.2197757845199897,0.16923109813123566,0.15309579649064303,0.11543789271428478,1.152644394540057,0.12253815120213951,0.9564724476848694,0.3372188333450562,0.2493237253228976,0.19198351592154975,0.10553727050287354,0.10235588871136667,0.5962253029032601
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Literary and Cultural Translation,2.0727065470232757,0.2557186726186593,1.4669191205269498,0.10820696943333186,0.15051084391317862,0.11082390497089767,0.10201368843517533,0.017418349570334835,1.2343738328801426,0.15228997974297998,0.8736048911171239,0.06444127452368498,0.08963480506102191,0.06599975696035443,0.10223233326279779,0.021507845174324802,0.8965238537485709
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Logical Deduction,1.549662644170762,0.6098122811532083,1.2526866112837514,0.34494148425016546,0.12355367783917648,0.1456435778164838,0.10064411320715994,0.010435339957842626,1.5823818922905641,0.6226877282117071,1.279135570484726,0.35222450552737605,0.12616236396614788,0.1487186654025198,0.12281544163412672,0.06498057506244947,0.6961354480699827
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,OptionBased,0.8088085834911545,0.6148667497920834,0.9586850187891655,0.511910551663386,0.08498995414997897,0.2800550031021333,0.1351059103122827,0.010736380016506453,0.7763786669291673,0.5902131076331095,0.9202456700938759,0.49138503200830685,0.0815822045563805,0.2688259427672803,0.07169078284097874,0.06709121992522515,0.5066352848462785
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Paraphrasing,1.272041274116976,0.0,0.9398035623820087,0.0,0.08985722024667381,0.0,0.09403295711341186,0.0,1.416305534809476,0.0,1.0463882061997458,0.0,0.10004807309912414,0.0,0.1010423482233288,0.0,0.999999999998257
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Personal Opinion and Advice,2.0663416605891363,0.0,1.2628475492676634,0.0,0.025459545736557575,0.0,0.07754554621147114,0.0,1.80704587453948,0.0,1.1043785728182989,0.0,0.022264743516703142,0.0,0.12090517826912095,0.0,0.9999999999983678
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,PostQuality Assessment Rewriting,1.071172946357224,0.5898752104109775,0.7375260154852862,0.16085898443927127,0.32161396746622,0.23550079806315757,0.11010013029195997,0.02830154994613021,1.1820631319219228,0.6509403929895106,0.8138763350056619,0.1775114612357409,0.35490815460361125,0.25988036000660236,0.0872253843383225,0.008665552609102067,0.8723964889311491
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Problem Solving,1.1330433865479024,0.5591739934933637,0.7425804841241616,0.5503650738696446,0.1905721879398207,0.1632406167814574,0.09187274303473836,0.03136543384949664,0.8897897388256523,0.4391246509495619,0.5831555109632419,0.432206922657439,0.14965815020644593,0.12819440764245665,0.05472791839742097,0.04824637601082804,0.44696394367334025
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Professional Content Generation,1.1307033547706453,0.5057276677008109,0.9169362518475903,0.2654818052004375,0.15837335068476258,0.2629025701748386,0.1032060903330862,0.09002093332274533,1.1164494895145616,0.49935236687476625,0.9053771760501146,0.26213509019928105,0.15637686558432434,0.25958836950943914,0.09291885180278725,0.0534425285603187,0.680336789233799
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Quality and Compliance Assessment,0.8701174160552914,0.24935378618451987,0.7807594025877655,0.24238569155890966,0.11082390497089768,0.046800635545142605,0.10892963542249001,0.014096952407637309,1.812157832397234,0.5193188968918061,1.6260555650426125,0.5048067321888738,0.23080839861858057,0.09746976293014381,0.15155100966843882,0.06340860178925328,0.7001729391254358
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Quality and Optimization,0.6669090565182821,0.569657335855476,0.7317175366070724,0.19032518458555459,0.2740645217523551,0.220196990239896,0.1833804061548071,0.2076175486574453,1.0625130800952334,0.9075725762048549,1.1657653258435912,0.3032242494193027,0.4366369541780833,0.35081572223495255,0.15866680146560097,0.04890200683030632,0.7541829824468377
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Specialized Summaries,1.2570182701069852,0.3687422074601787,0.8494497353914946,0.1379812737636208,0.08227551728836069,0.0448350088522466,0.09974931359129174,0.028416011749442138,1.0449412994625293,0.3065301202005014,0.7061354090362935,0.11470185830774071,0.06839446012342704,0.037270701250422686,0.05829105999846079,0.013240824207226887,0.8274415714186378
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Specific Character,1.1941650165698587,0.09846853718697979,0.8699874142898881,0.09966455342868918,0.32797885390035936,0.11845240856475592,0.08787829457887042,0.02772886295117455,1.7561536957964086,0.14480903652436283,1.2794141443618232,0.14656791265451719,0.48232971861346724,0.1741975624729859,0.14321954596720188,0.03023461921091697,0.8619821477306013
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Standard Summaries,0.9140164121966351,0.45831862389358147,0.8380303803184799,0.2242452452145508,0.14227393205723352,0.22333263282142052,0.13405134957932185,0.062493295193158216,1.0592477108859684,0.5311424901539317,0.9711879897997692,0.25987636491091237,0.16488034004574215,0.25881874430864527,0.09373965850717536,0.02977725389124014,0.7479053174407019
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Supportive Conversation,1.0428117612168675,0.0,0.8172326978892802,0.0,0.14526917273212264,0.0,0.11155477131987934,0.0,1.2424771083546209,0.0,0.9737068156399062,0.0,0.17308360759055486,0.0,0.09807609975060239,0.0,0.9999999999981339
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Technical and Practical Support,1.1326689814635413,0.2738305185746295,0.9257919721068545,0.21650754013775408,0.3070121691761355,0.3424870509193536,0.12402680314550296,0.06201481095022143,1.2153086182886061,0.2938092193044637,0.9933378426149484,0.23230395089825095,0.3294118062958952,0.3674749387916709,0.1109885082672658,0.030615156328648196,0.7763735325274445
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Technical and Scientific Translation,1.9420391725812376,0.12916975410459353,1.5089616914583361,0.19607906272229947,0.5724653739881843,0.2235198353636011,0.11569058417336497,0.020853811418566348,1.2736015906097924,0.08471034292662005,0.9895866352974855,0.12858989133389964,0.37542641836174523,0.14658572384693386,0.10784915798092692,0.03022016297157465,0.8373200514460938
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Textual ExpansionReduction,1.1766615788759753,0.13015256745104176,0.9396488602811788,0.22437979704174305,0.09397567617464636,0.10270399470381546,0.1055662182498866,0.02981825874337804,1.2047489822933046,0.13325936360503077,0.9620786710977798,0.22973583652782037,0.09621891482161148,0.1051555819601725,0.10724214668230464,0.050511851145400744,0.7283945996382275
rb_allenai_Llama-3.1-8B-Base-RM-RB2/Llama-3.1-8B-Base-RM-RB2,core_eval_runs,Llama-3.1-8B-Base-RM-RB2,Tone Adjustment,1.8341169070141388,0.38460762290998174,1.3838778928403643,0.006607989735443631,0.036691698267391805,0.011232152530834226,0.1057515436604392,0.028661348559542665,1.7005463757004482,0.35659836987768023,1.2830962552506002,0.006126759397015569,0.034019605984923484,0.01041416509742555,0.13564387825679175,0.0018833727147889445,0.9929190660446057
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Analytical Reasoning,1.1818692437192007,0.46383826502642256,1.0016756589190783,0.17550409699097091,0.12163872528480339,0.3783110363105451,0.13452226480499802,0.114025362774924,1.0338269094923596,0.4057373373449893,0.8762045854711077,0.15352024698134148,0.10640211521204124,0.3309233500865226,0.10235804424445144,0.04944345419786947,0.7732545061988612
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Casual Conversation,0.5266715138490361,0.30965513906939357,0.5171950605454831,0.16607064854855802,0.07813878379222469,0.0858922455860408,0.13499244643139457,0.0357629098547361,1.0953354658365573,0.6439996223119802,1.0756269850837878,0.345382399477664,0.16250770906415732,0.17863282968005953,0.11105106220621264,0.06340141564544019,0.6629701887863859
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Content Categorization,0.9611875960915722,0.7117879315341209,0.7417187554746937,0.18998554693857517,0.06653879939420371,0.09376654055066963,0.10646039824863418,0.07741153580408944,1.2339809478802106,0.9137995017993246,0.9522249524056439,0.24390508808903544,0.08542308606679969,0.12037829561955793,0.08639832469317865,0.03307344404670254,0.7357120653297239
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,ContextBased,1.1821310489226282,0.48413823772295916,0.8048395424908155,0.17296792900937008,0.12225296056976803,0.11535540040254204,0.0983104419251214,0.04117109780103706,1.212988146155535,0.4967756696635217,0.8258482217249505,0.17748290068571748,0.12544412240817007,0.11836651563363777,0.07782080130569641,0.02561064456080686,0.7710375864323746
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Creative Writing,1.2327400086313551,0.5051229317207678,0.9724027044640415,0.3198300192885005,0.19526640403335327,0.27880240278854607,0.1059388590867329,0.061655265753957356,1.2839572200858735,0.5261095045774336,1.0128035631891303,0.33311814299074705,0.20337922639274372,0.2903859333932698,0.10969759833849818,0.052009615309091206,0.6781535281236764
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Critical Thinking,1.4648202520389833,0.8778932636780049,0.8900392349959944,0.4546459934394105,0.26663853026006573,0.4143369427168121,0.11395100031149485,0.013738614666707338,1.744160974164165,1.0453072094391263,1.0597694133420736,0.5413468292190249,0.3174864069777489,0.49335085628098235,0.11675762446317728,0.09900922632659803,0.5285768975652348
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Data Management,0.9810042361048581,0.428635534596317,0.8544068755141104,0.2930232362849817,0.3039357023175915,0.3127162460633157,0.15967672539117783,0.16317187569008137,1.1276700259343735,0.492719018557793,0.9821456299670026,0.3368318996999111,0.3493758423263583,0.3594691279912332,0.13003283022510764,0.06181928106834525,0.671351150439361
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Development and Implementation,1.0789596599103686,0.6259561025334868,0.8175930361398356,0.2849035688687429,0.15221203485814647,0.25238525081961544,0.10626300820908308,0.057279377051817154,1.2145911847848234,0.70464243701437,0.9203692513577363,0.3207176098599065,0.17134597578210006,0.28411154949080936,0.10410873774368395,0.051759257429092514,0.6638228221736965
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Discussion,1.0572701057494855,0.20662472208974902,0.6888452926388839,0.15057743057520767,0.17538934743466106,0.2643829777382283,0.09283421784924145,0.035052966752967624,1.4109854312146068,0.27575211955012224,0.9193021414194095,0.2009539091817364,0.23406678451871482,0.3528337061846008,0.09762243759892258,0.027292173897172256,0.7824318607828084
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Documentation,1.2057539338165146,0.4910660061828883,0.8358164684194427,0.25533391583514253,0.07024434996579375,0.0515554862134266,0.14734371760407966,0.05788658796672114,1.3724548243400867,0.5589580845254654,0.9513718447629136,0.290634975161453,0.07995594647411516,0.05868326346724049,0.11905190034959029,0.038890989369790574,0.7292598570017057
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Factual,0.9666653665017488,0.4683695089318993,0.7482805280920107,0.2916263824538736,0.21331082420805256,0.28364202301536795,0.12558994952264868,0.16278599592262244,1.118790395150894,0.5420772545838404,0.8660381314139755,0.3375198977519226,0.2468797471966306,0.3282790322354932,0.1023763554331325,0.06800215045683339,0.6118718322113034
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,General Character,0.9966319928633031,0.6596685571902351,0.8216208085002594,0.24567173747274773,0.12357205601780688,0.16977060499186963,0.09295727828820238,0.08817204459237585,1.223398281226277,0.8097646722407035,1.008566343581978,0.301570071523137,0.1516887296638465,0.20839895551926763,0.10650818633367254,0.05496387137672626,0.6847984506236575
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,General Explanation,1.3836203612528364,0.48039240942776495,0.8933778107525235,0.2818075684996846,0.2959607130439521,0.5546242540303785,0.1169436406748593,0.07404705251501953,1.3508690863770987,0.4690211805199016,0.8722309246071455,0.2751369835643491,0.2889551131433082,0.5414959047337171,0.10027191354381104,0.07611927965995696,0.6403862212330277
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,GeneralExcerpt Language Translation,1.348417630822731,0.3820744861098162,0.868292620726672,0.13437008216906476,0.16916643913780605,0.022716636112791122,0.08789111341163791,0.040280900798917496,1.2821145608538187,0.36328749400380334,0.8255977871161054,0.12776296820367716,0.16084835281635818,0.021599635949625218,0.0774624337120276,0.014263373068022334,0.8452120844573394
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Human Decision Making,1.1994706089342533,0.3148509654143403,0.9648271359494773,0.27676949657218775,0.13960259001229422,0.5749645044505193,0.10157935078182728,0.0703798980553079,1.2588577849845204,0.3304396005783745,1.0125968424799416,0.29047267420392686,0.14651447557946096,0.6034316615397591,0.1104405863473093,0.04941322919584562,0.704981741945537
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Hypothetical Scenarios,1.2643982993842875,0.4465188438765997,0.8997349781280594,0.1910417183575308,0.09247765339533397,0.061544361667278016,0.09399207893542438,0.030557830084438964,1.7956675504803694,0.6341351447701762,1.277781617583215,0.27131277747659,0.13133450229048574,0.08740379769157762,0.1388815319020048,0.060522284830011375,0.7516110498831992
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Idea Development,1.1788484144488827,0.49102572845928427,0.9012386798092843,0.35129111701962035,0.24956077545186814,0.31545513126840397,0.11360371370045369,0.0559821661137804,1.202692551989825,0.5009575270366257,0.9194677063534955,0.358396554502898,0.2546085503665763,0.3218357273194038,0.09942104034011645,0.07799479840571061,0.5940441002055789
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Interpretative Analysis,1.1182908570099084,0.2630538128592883,0.9171864207052518,0.18679619455387142,0.15152983091459965,0.127249915654419,0.1148956969262454,0.1009842958295262,1.2694911494755061,0.2986204216616375,1.041196068273067,0.21205227088764733,0.1720176625086185,0.14445494272100468,0.11621661925571664,0.09276512662246245,0.7228728707881794
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Literary and Cultural Translation,2.0647972228477354,0.10939429730911443,1.3706419836470258,0.15337757148494413,0.20493305769837075,0.10278875063801929,0.09530915233239046,0.00757338357570192,1.2909608084605801,0.06839594170917107,0.8569582832345092,0.09589534095306429,0.1281290690045145,0.0642659953025788,0.08568769458281822,0.0036702472737979996,0.9399283429792745
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Logical Deduction,1.49518965563658,0.6837546359055702,1.200917250507039,0.27816355333915654,0.1623997815722938,0.3177106837902414,0.0974662761125138,0.03986921586157455,1.250671627037471,0.5719358208229111,1.0045234903591844,0.23267366954133462,0.13584149561486672,0.26575340213543375,0.10105854212710996,0.02896758148426004,0.7739775299794758
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,OptionBased,0.9592542653585687,0.8511589246356909,0.828033917158579,0.2882492072122237,0.1332588485446265,0.07689017436049327,0.10833499706492278,0.021090410069723453,1.304134112254483,1.1571753482402503,1.1257362270539935,0.3918831925289952,0.18116928579081107,0.10453443148697548,0.10373971624771439,0.08362861938593646,0.614929297358813
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Paraphrasing,1.6214200414122666,0.0,1.0020829114577434,0.0,0.024488855951377637,0.0,0.07077870109542672,0.0,2.5751422622809006,0.0,1.591509904710866,0.0,0.03889324561473539,0.0,0.15785010931373467,0.0,0.9999999999988567
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Personal Opinion and Advice,1.4735202403374992,0.0,1.2284884711397963,0.0,0.1978441783440246,0.0,0.11611308036156986,0.0,1.1004089343868961,0.0,0.9174218666476557,0.0,0.14774788666379932,0.0,0.0859510306918474,0.0,0.9999999999980067
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,PostQuality Assessment Rewriting,1.3062871319326965,0.4643215977096734,0.8883990365847773,0.23886032688100856,0.157727565634202,0.12469983227872558,0.10532371662601725,0.02492433942216865,1.2328158426141196,0.4382061246193749,0.8384315975343034,0.22542577964096977,0.14885627897375714,0.11768616948487029,0.09849058716301029,0.012082377758571461,0.8449500349253455
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Problem Solving,1.1997928307230872,0.26518853221031313,0.8240262836599571,0.7009666497924487,0.1939775168780176,0.5173270819728526,0.10707643622953089,0.09703664841610715,0.9513130086784478,0.2102673844883476,0.6533685675269717,0.5557948635144104,0.15380433227458723,0.4101874675529067,0.04798526740772313,0.09441393857547475,0.30489725679193563
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Professional Content Generation,1.25473164571927,0.5354520575947603,0.9388916384253143,0.268267540414726,0.13919981277625182,0.17669837345179878,0.10180353281592502,0.05491537913198888,1.2325044661669928,0.5259666914876734,0.9222594660411961,0.2635152646518655,0.13673393153162322,0.17356821690660623,0.09864354662739294,0.03999214688960204,0.7151889171229802
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Quality and Compliance Assessment,1.5355479346880279,0.5088687600159623,0.9626398317926361,0.39619854118703635,0.022877747007208054,0.06283324882261368,0.10125050953302805,0.019425095124033454,1.91042241562301,0.6330992760198593,1.1976498234174953,0.4929227912866352,0.028462908720854836,0.07817277747277032,0.1279122042872356,0.05675757222249728,0.6524354888229986
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Quality and Optimization,0.850343300732705,0.5253423489700961,0.8734671893841611,0.19823352967219887,0.3542023013756824,0.6189075009027447,0.15551329849122997,0.3178460252972594,1.1351920371548054,0.7013219846821773,1.1660619860831603,0.26463796937903716,0.4728532955067904,0.8262296723626723,0.15878512535659117,0.12431895194336351,0.7033536087648773
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Specialized Summaries,1.3646696922970416,0.1740601825557213,0.9065038733448829,0.11711419433326054,0.15591506807201122,0.007773600655618246,0.08575693428445541,0.0053563589710143855,1.3406915999651674,0.171001837263683,0.8905760384285242,0.11505642535007954,0.15317554368800562,0.0076370136739305206,0.08317341880787393,0.01138452769303594,0.8755322170093405
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Specific Character,1.2337872294450654,0.35847174007773197,0.9883840101296794,0.1946650352434287,0.1757518469470992,0.28250795336013607,0.10155895206107246,0.027128639899158313,1.8619385341582542,0.5409784850485915,1.4915945238256758,0.2937737737292352,0.2652314179224191,0.4263396734978452,0.1690764534394893,0.03724044861089004,0.8122444992910851
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Standard Summaries,0.8940043531197007,0.32161762297985264,0.7340794138977561,0.11200339873814469,0.055462425403037835,0.14894702188847778,0.10396386001295654,0.03564693614137843,1.0321978004235044,0.37133264716082537,0.8475519763604402,0.12931666542119213,0.06403569882785932,0.17197096170328524,0.07847550394787106,0.04326189417266785,0.7874120771602204
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Supportive Conversation,1.3211093342190567,0.0,1.0511501294129526,0.0,0.07636656395363815,0.0,0.10894479026443282,0.0,1.1048390386951819,0.0,0.8790731156187226,0.0,0.06386508589530686,0.0,0.07994604548674544,0.0,0.9999999999979146
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Technical and Practical Support,1.2983926981062657,0.2929801614972385,0.9383680280184591,0.18141086711349486,0.21653304209639174,0.13984425635391962,0.10203163974473939,0.03718467860539698,1.2641211699120654,0.28524684793213684,0.9135994765812276,0.17662246399317372,0.21081757691547534,0.13615301842457783,0.09377962186441646,0.01906702280928596,0.8202101857684091
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Technical and Scientific Translation,1.5423951477007487,0.22321914421469558,1.1587207409033196,0.11053326182658996,0.39862863051115865,0.2607982603374509,0.10783320601466917,0.015970008977707173,1.29189962203755,0.18696682784070884,0.970536564155804,0.09258190379373166,0.33388861334166275,0.21844283837397863,0.09709592061586553,0.008392532664961372,0.9132981980847206
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Textual ExpansionReduction,1.4110092133037193,0.2603753442396064,1.0036145837803603,0.08932536211047437,0.0600138081703169,0.08929571323059904,0.12570087557761767,0.05383352845950479,1.3623593338321083,0.25139791943245204,0.9690111750453132,0.08624553239748622,0.05794460514414938,0.08621690577488536,0.09539594647900912,0.04889889275789103,0.859270650652888
rb_allenai_Llama-3.1-8B-Instruct-RM-RB2/Llama-3.1-8B-Instruct-RM-RB2,core_eval_runs,Llama-3.1-8B-Instruct-RM-RB2,Tone Adjustment,1.8278836526075986,0.6500019035252174,1.3749136600003866,0.11222940152059091,0.034799953194062955,0.015466645864027982,0.12255724423212017,0.0489156203295224,1.4298271509046545,0.508451601103947,1.075500007023845,0.08778931043833182,0.027221600158248236,0.012098488959221436,0.11289957141712031,0.013832180181658271,0.9066703964665945
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Analytical Reasoning,1.125539659787936,0.45974875622291644,0.9940900994251333,0.31704462035545633,0.22418084306507952,0.3120107590681387,0.14493191544637446,0.1436793427839998,1.0232178725344871,0.41795341474810566,0.9037182722046665,0.2882223821413239,0.20380076642279954,0.2836461446073988,0.09742127359898944,0.06626151036495997,0.6477576608080429
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Casual Conversation,0.8615880468413679,0.45524767246943704,0.5776577429060981,0.21094071485988242,0.12744113254253997,0.14447135241577871,0.12081727446141649,0.14992834328065763,1.2493275549230696,0.6601222749267976,0.8376204128794371,0.3058701295772992,0.1847933233285819,0.2094876340633186,0.10040191209657501,0.046880450054477885,0.6637683583000447
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Content Categorization,0.8668952948492017,0.6714004631682363,0.7855942362868624,0.14210884328148976,0.2535816954091096,0.17036937908691685,0.11936151833983111,0.030207854614965335,0.971170359326711,0.7521603046428356,0.8800899500468755,0.15920249794087893,0.2840839346024838,0.19086237067983097,0.0738162529452685,0.021032319256102383,0.8014825382797218
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,ContextBased,1.1176459681307147,0.4905845389519756,0.8583559253401133,0.27423860473390393,0.1275083128970695,0.1179771000981908,0.1124760861812506,0.04854357472064763,1.0856390789505008,0.47653529131945005,0.8337745250014257,0.2663850224001931,0.12385675904475403,0.11459849893438812,0.07867267643382081,0.020261138354746355,0.7518738124496033
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Creative Writing,1.1248678562426404,0.4812632647610018,0.9959487559004506,0.2967169647473924,0.16146798211175264,0.25326993657637087,0.13124681225942525,0.06252948760218305,1.237798473989052,0.5295794802937812,1.0959365968040906,0.3265057349916509,0.17767848974156034,0.27869686137952254,0.11471188320598097,0.05370422006755168,0.6948023941721986
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Critical Thinking,0.9754587477689406,0.3766130674926089,0.9010079859825355,0.1330693533553482,0.3877650063445127,0.429618367216417,0.13768864972491546,0.07587642751425061,1.0663769411585542,0.4117155049679653,0.9849869533171445,0.14547215893812027,0.4239068666919542,0.469661194125961,0.11455363707223909,0.04603364712222746,0.8057392671699706
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Data Management,0.7019003441246481,0.573585866973224,0.7639600627422711,0.37778499145495736,0.29398123142127297,0.5281047669567246,0.18890600983820238,0.25277107787710723,0.887094594034676,0.72492473620483,0.9655285787931018,0.47746240108355653,0.37154727636827545,0.6674435876441988,0.16684856411232868,0.09034753076221755,0.5963459649017492
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Development and Implementation,1.014440148484707,0.4154433124106838,0.8047460024255393,0.1321605524482401,0.17749049666704828,0.5008295430177309,0.13570289992081036,0.056553354894826935,1.4288691204807038,0.5851642615885613,1.1335086790617863,0.18615206882440116,0.2500006434535865,0.7054333069441662,0.14696401502944895,0.07939520956354282,0.7773362165386198
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Discussion,1.1061245373288986,0.29141158286051794,0.768735466276746,0.11291361414064649,0.23996822637952175,0.2549368491231352,0.10271184276265061,0.019876270082790193,1.2943461951960535,0.34099910162233127,0.8995459302352597,0.13212735267739995,0.2808019805186944,0.29831771156133763,0.08743817653464986,0.01858452577259395,0.8405366154708328
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Documentation,0.6989444085253483,0.48504215970328857,0.7850099071615273,0.3853315846137759,0.501971609044733,0.21282736314958706,0.602339338475378,0.32222306087625885,1.8703096646040769,1.2979273143446208,2.1006128645793534,1.031111169915328,1.3432289325184221,0.5695060570420718,0.4947857668019518,0.1506616318072661,0.686707954356948
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Factual,1.016841846159138,0.6793277450027222,0.8802492564329072,0.43868211671502766,0.14833422280122732,0.5434890681439897,0.13156946817411108,0.20481718793051762,0.9923301064772303,0.6629520373082552,0.8590301841549755,0.42810735340385664,0.14475851447553675,0.5303878542151685,0.09632072827227867,0.07795773030867492,0.5396275796109394
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,General Character,0.9101594431662263,1.0678653254243247,0.855381328531222,0.3914851184765862,0.16482699983822974,0.1566981769401552,0.12753131721793043,0.11771622882342,0.9558578252498862,1.1214819944832866,0.8983293450265133,0.4111412750528163,0.17310283246609065,0.16456586783254795,0.10553466397959327,0.07083737670899887,0.5802781988882708
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,General Explanation,1.2698430613173908,0.23758332379372327,0.9576410248509388,0.2684853538168325,0.1370143330629996,0.3266980640771596,0.1325100092981395,0.08731556320449751,1.2301451500990548,0.2301559794373773,0.927703192734796,0.2600919483134889,0.13273098262760671,0.3164848092758281,0.09124332202918983,0.10239595278987362,0.6384963938519601
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,GeneralExcerpt Language Translation,1.3992324241412875,0.6366682198764633,0.9092005902291067,0.1506369827412346,0.2148091836082085,0.14845178842165402,0.09219676220736661,0.028102960829010204,1.607908153642958,0.7316182817396926,1.0447950012475942,0.17310235855808576,0.24684493572466673,0.17059127340659339,0.10926176732889975,0.042534385687104825,0.7926005447268231
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Human Decision Making,1.3014178279462751,0.3227848084258138,0.8920389421226018,0.37036809370280044,0.27936950431109775,0.39008272857578186,0.10161484447310348,0.07357503615563138,1.4462598918125633,0.358709333841958,0.991318941713534,0.4115884288866142,0.3104620978785125,0.43349721566227795,0.10413981820607782,0.059921913931303944,0.6173379781226053
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Hypothetical Scenarios,1.0832160364343248,0.057338432590963384,0.8585500019198653,0.016113954482294024,0.019347942104507913,0.07497327565496816,0.11023982025123691,0.02152506636828172,1.0769943558923174,0.05700909716907043,0.8536187382922803,0.016021400574592226,0.019236813104501822,0.07454265077994457,0.08077972867241867,0.00817066638599645,0.9688337215898499
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Idea Development,1.3343362016657503,0.4979407877729607,0.9843937349213694,0.20491501028277437,0.18958296048236573,0.3165874207204637,0.11700089099293298,0.06551977165693312,1.2889532366117413,0.48100500401602186,0.9509128877289218,0.19794551433482988,0.1831349327216964,0.3058197627395444,0.10687220493037336,0.05662105597232417,0.741641296274126
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Interpretative Analysis,0.9919851149832077,0.2515148498142866,0.8614891424305328,0.3493285129488193,0.2142381505947074,0.24291576443450535,0.1032432071307337,0.057844654654537225,1.1575465544434453,0.2934924560826868,1.0052709193401381,0.40763113319457744,0.2499943086902443,0.2834581909486789,0.1070966840303268,0.09866766491507573,0.5694232600984291
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Literary and Cultural Translation,1.9560232024821262,0.4121514750387365,1.2191890740021165,0.26189141540766214,0.08599085379781295,0.1932778799814905,0.08996021293517809,0.018471521971067884,1.1087665397990372,0.23362696529973537,0.691094077629926,0.14845245091988057,0.048743686321705164,0.10955905120902011,0.06443695565424812,0.01223685217081738,0.8174315934678016
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Logical Deduction,1.7210935026923202,0.48732629175729314,1.2845630234431968,0.31856177669524843,0.05105706944245143,0.21900795576630483,0.11718930372546277,0.037411308347817585,1.3859302828866613,0.3924250857590008,1.0344091077454038,0.25652552438404674,0.04111429076052393,0.1763586682622474,0.1140866753269354,0.011289778490335078,0.868085640222463
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,OptionBased,1.1423347484203212,0.4065755056127842,1.0128856697257316,0.16866747677216865,0.1322109377141374,0.15518661896324054,0.10228990533929083,0.022157219748151435,1.4481112214823264,0.515406322771762,1.2840116318240333,0.21381584176673907,0.16760073417297214,0.19672647151400696,0.12743129582248058,0.03423211199805154,0.8141620400924837
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Paraphrasing,1.369404346730171,0.0,1.0017299986319095,0.0,0.014914038705558182,0.0,0.0820710805263577,0.0,2.8679223281541337,0.0,2.097907631678008,0.0,0.031234240426325433,0.0,0.17046288376236396,0.0,0.9999999999991183
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Personal Opinion and Advice,2.2352247559069003,0.0,1.451722674480369,0.0,0.3724478855117773,0.0,0.08469176855642857,0.0,1.7220423994248986,0.0,1.118423546023857,0.0,0.28693805789888305,0.0,0.12431070564439728,0.0,0.9999999999983906
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,PostQuality Assessment Rewriting,1.0257096529570375,0.3216595374874439,0.8383511086579832,0.21688710929622634,0.24144619417917165,0.03870008298117392,0.11933765874316093,0.015774250652873523,1.0936836146653202,0.34297597242713873,0.8939087862071103,0.23126025672622252,0.25744687654618553,0.04126474438519395,0.09792498878199818,0.005891323749455002,0.9069928397993661
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Problem Solving,1.3227811806866694,0.5584031068495479,0.8835224325952632,0.6828472491316864,0.19724152089873342,0.9394500777411063,0.10696789514530392,0.05834044731546728,1.1904556341066834,0.5025427745401094,0.7951385105106169,0.6145380406312417,0.17751029668548618,0.8454713858480377,0.06473820822630283,0.13008116178775364,0.32747534949356116
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Professional Content Generation,1.0577546820676287,0.512518924705871,0.8113669995886175,0.14311561553895336,0.18259620361129342,0.2126384184024727,0.12760664656222176,0.057101924710778434,1.2057058365076059,0.5842064037292171,0.9248552084320751,0.1631336035434937,0.20813645370769593,0.24238075848783217,0.1075357352771088,0.04897210242345476,0.7754708718774589
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Quality and Compliance Assessment,1.1313171702774765,0.17386275752245317,1.0531341665727636,0.26842283876470074,0.24796268856853715,0.10863063327426836,0.1261725684480266,0.021945496900671713,1.439232690013414,0.22118374119683604,1.339770277799106,0.34148065144419615,0.31545177309023226,0.13819710568596738,0.13245358994839707,0.03589919323514873,0.7689671572209321
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Quality and Optimization,0.8733446088840376,0.515138958532523,0.8175102697861522,0.22927162104165144,0.360624143114578,0.5600490255355215,0.1272313587826137,0.09340396859827843,1.1939515077880907,0.7042477047476216,1.117620248999587,0.31343778257872823,0.49301013029280544,0.7656443649750322,0.15020605578398571,0.09718622854918152,0.6761813549482668
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Specialized Summaries,0.9163400357829441,0.05643149780481471,0.6466258413145087,0.032179389819650295,0.07819793267238614,0.04487647682573364,0.09246104996549265,0.007477329227069451,1.2654819751390316,0.0779329075598818,0.8930018496282899,0.044440312763313505,0.10799274333297894,0.061975216964286886,0.08390938769509321,0.0005000956804898649,0.989411539616328
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Specific Character,1.3710166752388802,0.379972085219086,1.0964878886958276,0.1013564937754865,0.3348940673297637,0.2946530349665684,0.09627236491050561,0.07330755725782628,1.5149688993394623,0.41986789958173265,1.2116154966438595,0.11199859096476805,0.37005683865186295,0.32559063075768735,0.12196719838760695,0.023562068667998592,0.8824596912028406
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Standard Summaries,0.8236311465321771,0.6976008014347574,0.7334601817858593,0.2306506844304661,0.07268914360096375,0.06838960091107311,0.10546202619220679,0.10011987331194988,0.9564257126050263,0.8100754159617122,0.8517164266863321,0.2678386388713716,0.08440885979107982,0.079416099138003,0.07957977662864979,0.02614833052013177,0.7251565685212054
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Supportive Conversation,1.5159246999591007,0.0,1.075520153598774,0.0,0.13866025174897337,0.0,0.09653205611263149,0.0,1.210232505374473,0.0,0.8586372727522018,0.0,0.11069886510493739,0.0,0.08094387622218024,0.0,0.9999999999978714
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Technical and Practical Support,1.2500920370857056,0.5206309525153132,1.037380373435586,0.28957252065527195,0.3006992668742271,0.1884408944553635,0.10872264235298301,0.04906291627213849,1.227594293069296,0.5112612248878858,1.0187107736006733,0.28436112161374494,0.29528762122625585,0.1850495481545236,0.10244132265678285,0.038143711363647914,0.7268222508836089
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Technical and Scientific Translation,1.248882790704174,0.09284324995982596,1.2310799967538455,0.014421382772341618,0.5448830605004775,0.37484958318620837,0.17067378853888804,0.050802072123968256,1.1480299595925567,0.08534574524781657,1.131664820308424,0.013256792074190615,0.5008813337689796,0.3445788147948665,0.12788978400997802,0.021725498016789857,0.9783210293403415
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Textual ExpansionReduction,1.4220065643268018,0.18105105545711409,1.0533245109105973,0.11539718787216025,0.011823742397199279,0.03088616799495664,0.12550358371511333,0.04255621573861637,1.5411211038256791,0.196216817442727,1.1415563568158889,0.12506344626905141,0.01281415950646381,0.033473351324413286,0.13610578783745164,0.020093327446456394,0.8818142454261116
rb_allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/Llama-3.1-Tulu-3-8B-DPO-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-DPO-RM-RB2,Tone Adjustment,2.031198019200683,0.3913255651345786,1.5655191970998152,0.07576497747188937,0.034799423646302424,0.026737781102757457,0.10713230468166829,0.033441488605067404,1.5525868376684322,0.29911752371154665,1.196635914646528,0.057912476118596956,0.02659963558499249,0.020437557843295387,0.1220022937683174,0.005902918781244493,0.9527624697393191
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Analytical Reasoning,1.1122988262323172,0.49594908785043756,1.009775986726195,0.30900621791182714,0.23009291937365633,0.3666247859113062,0.14541940091668626,0.15045132500257224,1.031469258829629,0.45990899747248026,0.9363966445424305,0.2865510661760675,0.21337231273740537,0.33998255439453523,0.10429664712223086,0.06791942096534825,0.6594367913851913
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Casual Conversation,0.8874228107585409,0.44923720490480135,0.5754382288512414,0.1863479598447646,0.09884660020825331,0.11216343384742078,0.1224796083079383,0.15763439986521943,1.4210913524304796,0.7193945201099377,0.921488923841639,0.29841184052064174,0.15828987836478114,0.17961504253336968,0.12146968226401395,0.059267806231616815,0.6775349048011915
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Content Categorization,0.8423926928858921,0.6927499231261754,0.7883016366608202,0.15058852426788227,0.22226756352383628,0.0833331754533469,0.121501577613362,0.030338883975123077,0.954051475368462,0.7845736220191101,0.8927906733320907,0.17054896720334378,0.2517290317179824,0.09437895136060245,0.0800814059739613,0.025165425325616164,0.7885078470070372
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,ContextBased,1.0903329150749275,0.5032724883085751,0.8415155818500936,0.27086714195956085,0.14295003120394967,0.12782701619812967,0.11537159595736074,0.059393910112883946,1.0279803515185413,0.4744919852350852,0.7933920655592994,0.25537713844700105,0.13477518773843883,0.1205170083633489,0.07650453027646359,0.019276229792295735,0.753776547832543
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Creative Writing,1.1340588069726063,0.4421326055148329,1.0006349643226773,0.3212857724130067,0.15231986518202367,0.24365000516610777,0.1267968923071408,0.06437851913835288,1.2514894246757446,0.48791497998525934,1.1042496809787317,0.35455458217994207,0.16809242983811373,0.2688797114512774,0.1149050091639463,0.04778001705894741,0.696031605186873
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Critical Thinking,1.0549128833336368,0.383305149696449,0.9202553649442259,0.1270285591410848,0.3818636367767453,0.43924957967542566,0.13839574228089224,0.09352222344157135,1.1150574431622542,0.40515882109695645,0.972722592076373,0.13427093611449004,0.4036351220921148,0.4642928539039084,0.11153836610742418,0.05419294655696083,0.8065479220754159
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Data Management,0.804501496139395,0.6661162558478404,0.7370249628027887,0.4038371749866201,0.23283865826833006,0.504821256653852,0.15836355940038566,0.2735950006233349,1.075221939753223,0.8902690873178564,0.9850390756937163,0.539731240671726,0.3111905136214106,0.674697180070831,0.19085739415456215,0.11839603472103943,0.5588425351549394
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Development and Implementation,0.999037096827027,0.5005482004990162,0.8376029039251836,0.1281478290932886,0.18753396650621393,0.5668492143713402,0.1439914781835645,0.07081871398744613,1.3873005247911767,0.6950800760462594,1.1631269267903976,0.17795090003824954,0.26041672624223544,0.7871481600357246,0.13919473481977301,0.0734670342440033,0.7888156538783686
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Discussion,1.1526611879840205,0.3543376043576414,0.7652069217356203,0.13649897487380747,0.21138757315369172,0.22184712225558936,0.10193598759874234,0.02535678730748142,1.3536570853509007,0.41612540939622145,0.8986402788304297,0.16030105499103997,0.24824839175429364,0.2605318302000026,0.09399208454165686,0.02650533005104802,0.8037011708819273
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Documentation,0.7560392046484042,0.5308886152851605,0.7778220665461488,0.3793238282991721,0.38248142802804685,0.16337146423308535,0.5409734272413046,0.31248990693058404,1.9715608346271816,1.384424504721865,2.0283650811789964,0.9891815119075544,0.9974157409245192,0.4260318491386137,0.4445848197397923,0.16129046605114594,0.6596395815844985
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Factual,1.0658958389123316,0.5798828936869944,0.860423044960916,0.409763394767624,0.2155405032318857,0.5262895026365821,0.14695203544291013,0.24443514428536328,1.0236016920594773,0.5568734669046529,0.826281942903241,0.39350421393536683,0.20698797739997157,0.5054066314412682,0.09480630461661799,0.0753589907322984,0.5512243263631779
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,General Character,0.9579883003516549,0.9986252359928258,0.8879223826963218,0.3403724712572316,0.16333714249690193,0.18614393619078537,0.12683470339969855,0.11868460173219672,1.003538703545348,1.046107843162634,0.9301412934300701,0.3565564928116798,0.1711034927691427,0.19499470330604024,0.10965022904660054,0.06054334674676176,0.6360302957213102
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,General Explanation,1.2298164509243517,0.27117603758521147,0.9553645942967446,0.2560573127964145,0.11470324232499395,0.3574608823503327,0.12850507738738326,0.08225122354485731,1.154833376030455,0.2546421774954406,0.8971151092851504,0.2404452556897524,0.10770967690036226,0.3356661534761439,0.08666259682499666,0.08750882663028553,0.654683806129094
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,GeneralExcerpt Language Translation,1.400601410173056,0.6246555985382676,0.8945818890948318,0.15017693582841274,0.1937805224915966,0.16649474222577665,0.10123116417321176,0.028279448412948438,1.463764034924532,0.6528255595870045,0.9349246588221463,0.15694943325356125,0.20251940161693557,0.17400312030530543,0.09278192729138068,0.0345753432918145,0.7933650156660107
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Human Decision Making,1.2784160293600761,0.17452602849269705,0.8750174098913553,0.36980908032387916,0.3038160086956453,0.34785079621897474,0.1024870295265431,0.07475938198366172,1.4723061303905425,0.2009954003714538,1.007726332582332,0.4258959239641862,0.3498940578344363,0.40060735157614225,0.11435120232888546,0.05711377752052921,0.6326473221989733
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Hypothetical Scenarios,1.039948606357665,0.14888769156368165,0.8657009652806777,0.02160934645370638,0.00823721668402111,0.12067522442090928,0.11338523416914559,0.04354957455433267,1.0604540271970129,0.15182341815117617,0.8827706190170133,0.022035433608771138,0.00839963585898624,0.12305466533414842,0.09017596382840587,0.011670449532134086,0.9590125152155509
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Idea Development,1.3314087900272786,0.47243869856479415,0.992897319576437,0.2220959548429191,0.20936259071886987,0.31393770320698733,0.11771281926724714,0.06881989303382413,1.2790792697309388,0.4538700285586299,0.9538726106920288,0.21336672392320333,0.20113383030930929,0.3015987359906001,0.10771495313875129,0.05714405258592459,0.7300191791439403
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Interpretative Analysis,0.9515358139491719,0.16302824687125106,0.8567401319921228,0.385230980448963,0.24382161384702483,0.28623469933568774,0.10620758646637363,0.07079536036922968,1.0034117737566606,0.171916243159145,0.9034480078294269,0.4062330557943241,0.2571143139794877,0.3018396818708379,0.09341827418047899,0.09682518447121571,0.534086813590945
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Literary and Cultural Translation,1.968471696195853,0.366693429383673,1.2279080193853191,0.25028172730199305,0.01729815503644433,0.16853045020315585,0.08989539995350626,0.01486199926754378,1.1250699678234324,0.20958176111702942,0.7018045718054995,0.14304724595564133,0.009886672370178106,0.09632272008470273,0.06605427779024697,0.011113413972151653,0.8316577449770487
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Logical Deduction,1.7951984110738506,0.5879141799539149,1.296793412323177,0.36838854179850955,0.10330842591209809,0.19220172262715923,0.11068854446398313,0.03555975452567528,1.3946568301084483,0.45673977958785317,1.0074550972080596,0.28619432413916457,0.08025842765751705,0.1493180049442178,0.11051822379929765,0.012792866042115936,0.8483321780111346
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,OptionBased,1.1976913058566694,0.39099321860153524,1.036852023048672,0.16843964060950378,0.18870090553645025,0.1776493064853886,0.10127210657224806,0.020563882019466462,1.5501204914841455,0.5060457542121557,1.3419531057002438,0.21800420292794853,0.2442274891729663,0.2299238784938657,0.1420416822790005,0.04619593235867059,0.8051917564939626
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Paraphrasing,1.4350947550373945,0.0,1.0284203165262782,0.0,0.013179546694433775,0.0,0.0813572650461909,0.0,2.936534998541741,0.0,2.1043852624296564,0.0,0.026968393548418637,0.0,0.1734149829007804,0.0,0.9999999999991219
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Personal Opinion and Advice,2.1644488098031887,0.0,1.4032308365173303,0.0,0.35955450825752144,0.0,0.08386858167268985,0.0,1.7827220456937245,0.0,1.1557540821139387,0.0,0.296142715686371,0.0,0.129349594991512,0.0,0.9999999999984437
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,PostQuality Assessment Rewriting,0.9978015143244238,0.3327835540344528,0.8389147569526387,0.21985741493851152,0.22364043297117311,0.007688068905086359,0.1168055002376277,0.020509997664196145,1.0788289700431106,0.3598075706362823,0.907039656890034,0.23771115307942503,0.24180137482116498,0.008312386120310156,0.0985976447132505,0.005418470153631505,0.9131478331514887
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Problem Solving,1.2742974210180658,0.6652925341794385,0.8903515989128594,0.7384054593026109,0.21361848600561412,0.8292474679276418,0.10559376319261571,0.04113631505512305,1.1643338719475091,0.6078821313787577,0.8135200679670593,0.674685888330617,0.19518460512285327,0.7576888246100993,0.0669902542414017,0.13062077064493605,0.3123262697965801
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Professional Content Generation,0.9680102473172141,0.4809161674020992,0.8156522468727643,0.15302107731988224,0.190554279290355,0.11518374663156183,0.1431207732812137,0.07103593408813541,1.0759806210564418,0.5345568168431024,0.9066288436393419,0.17008881286709976,0.21180841044197152,0.12803116452508223,0.1000957788812803,0.041847751289582846,0.7718295358600066
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Quality and Compliance Assessment,1.107081922332437,0.1982423481954414,1.117134377507937,0.27137052742358425,0.33662758848699603,0.1184786333051703,0.12335076156625746,0.023811089163086607,1.5434431000054665,0.27638043606248686,1.557457774317961,0.37833240670603585,0.4693108235631701,0.16517750437806522,0.16326448778227798,0.040229815674973635,0.7829789536044612
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Quality and Optimization,0.9467307708834929,0.5332224933456331,0.8034642170014071,0.2147987723776995,0.4014270264012954,0.6195073381107543,0.12949811081492824,0.09644689020607006,1.2992425355727935,0.7317659524600826,1.1026312005943892,0.29477831527699383,0.5508969219859119,0.8501786458486413,0.1514463903616381,0.10687489597588234,0.6785634783799427
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Specialized Summaries,0.9253140075050379,0.014277842252303219,0.656079054777311,0.04850805380590195,0.07990100183500476,0.0749586718245921,0.0918199861075436,0.0035031999236375677,1.2071342854148304,0.018626404404027186,0.8558992023688075,0.06328201496239949,0.10423622464561295,0.09778862312114205,0.07767858203717598,0.0022876330841519743,0.9587531104876369
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Specific Character,1.4069166096308054,0.40115245251182796,1.0588484424160023,0.11616000934966797,0.2561774388730565,0.3121905123244001,0.08798153145162879,0.08047802396105125,1.5000393452943028,0.42770442690768484,1.1289328119769575,0.12384855163518216,0.273133627860965,0.3328541638562885,0.11846897912452892,0.0220575060278832,0.8710459527686891
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Standard Summaries,0.876439855179846,0.7052430350969406,0.7159361626811976,0.22025783519398479,0.09239411380577012,0.07375741105817235,0.1079463967789196,0.08185914696308177,1.001383490348508,0.8057811701002954,0.8179986900587731,0.25165738239722035,0.10556564677389502,0.08427213035553509,0.08122503810935833,0.02367597418157391,0.7413013779418821
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Supportive Conversation,1.5737888909545998,0.0,1.0899134271882414,0.0,0.16117487311734638,0.0,0.0945554091439027,0.0,1.2818343536571533,0.0,0.8877229223766295,0.0,0.13127522407584927,0.0,0.0869772290759867,0.0,0.9999999999979481
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Technical and Practical Support,1.2531552315290782,0.5401898057908678,1.0176165966809856,0.29305662109292285,0.2574130213756597,0.15177929783713479,0.10671395752369262,0.05379376123236096,1.1783256751200692,0.5079334958565616,0.9568517396160158,0.27555735491474276,0.24204213856815618,0.1427161129710705,0.0959691106326317,0.03122854736255265,0.7367182894670262
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Technical and Scientific Translation,1.2399756848346444,0.08072472350340698,1.2507832182061793,0.020554906447626475,0.5282115198628536,0.32132009414919016,0.16869339536275918,0.044172220839336584,1.1603810551604967,0.07554296506137881,1.1704948478184685,0.019235477214702534,0.4943053708735619,0.30069440429618494,0.13069027118355991,0.019987803313536556,0.9707578330505567
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Textual ExpansionReduction,1.3826168204129434,0.19096614012455593,1.0614187591035162,0.12145509053752135,0.01626850295094169,0.013007938013516667,0.12485795259659865,0.04450179558795245,1.504162187486446,0.2077539075358601,1.154727715561465,0.13213216559141783,0.017698661425592677,0.01415146135084309,0.13203016180655602,0.02141968963207508,0.8744157475350123
rb_allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/Llama-3.1-Tulu-3-8B-RL-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-RL-RM-RB2,Tone Adjustment,2.019834174394343,0.43887204057740803,1.5514282798311008,0.07425888975907435,0.031747605969664694,0.016851972466059854,0.10677380994165425,0.03530000844175801,1.5751500474284774,0.3422505294217151,1.2098677998111427,0.05791014688671958,0.024758093353803596,0.013141863607262237,0.12287507710210671,0.004013046501236184,0.9619176698968224
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Analytical Reasoning,1.0772870537120816,0.7391971529894643,1.0102146609336184,0.4188757056750321,0.27877235539540646,0.455880470392719,0.16887604346625612,0.13231017331579487,1.0933231553224636,0.7502005718225401,1.0252523474027986,0.42511093643843856,0.28292205885826927,0.46266654056799483,0.12091161969504727,0.07308022633578945,0.6114918548826928
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Casual Conversation,0.689008477349899,0.3252411171754688,0.4935856513235918,0.32825893918491256,0.10166424039809399,0.11260279790928134,0.1000860284311143,0.10040878740011164,1.1342805439843449,0.5354283487698237,0.8125656207302003,0.5403964397339374,0.16736480564947318,0.18537241132061902,0.10217818084280272,0.10084302194393371,0.45174896917805907
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Content Categorization,0.8781007472675564,1.052795945166813,0.6350216914633935,0.30381754120022875,0.2708901595417568,0.2451523771624925,0.11694970855919185,0.10548098998036581,1.0158301650690755,1.2179261685982747,0.7346243488221849,0.35147108573666985,0.31337906992925546,0.28360433644429056,0.071193549043015,0.04281999685271587,0.5868790158722165
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,ContextBased,1.1990589365939757,0.6801410070145432,0.969541368553887,0.305440674785171,0.1917464787255191,0.1969292235757186,0.12370351368301041,0.07721642813458646,1.0752040464368138,0.6098869209606457,0.8693941313829017,0.27389066496454706,0.17194033047393104,0.17658773191894525,0.08029846902099935,0.04048043631292858,0.6785789123175894
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Creative Writing,0.9976607894762326,0.42057441528297723,0.9346456721145007,0.29824721586842506,0.16182630670962433,0.2367875978892316,0.12134226842481177,0.05000910749431581,1.1783638564151673,0.49675169669902086,1.1039350149791836,0.35226776792572,0.1911373814178746,0.27967616843649246,0.11001073684327728,0.0407612724369249,0.7097142543169116
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Critical Thinking,1.002647484812215,0.8470947625575365,0.9275163957627446,0.306076299705909,0.40150940511652317,0.41759551910356335,0.16767758184327008,0.11266567706950814,1.0738444198767862,0.9072460637046615,0.9933783518347862,0.3278104532993299,0.4300201623636224,0.4472485342531906,0.11568905262359208,0.08389477352289121,0.6353476466389828
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Data Management,0.7465565501385352,0.37906927610460195,0.8441450831142878,0.36626741038991595,0.27286070850516925,0.4664168750542303,0.2088277720191371,0.28237959391592243,0.9299796835207601,0.47220364667456827,1.051544959446883,0.4562564621999159,0.33990046071365854,0.5810118707969066,0.1330089659325856,0.13937018680664987,0.5413614008602433
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Development and Implementation,0.9508501977739456,0.43442561586250406,0.8403721074204524,0.27016628441234003,0.20911948183152246,0.5540057657136642,0.15370897812694184,0.09897429267293803,1.1341767667352691,0.5181840857155195,1.0023981925649548,0.32225509723090573,0.24943830092299943,0.6608196218298539,0.12960191749701327,0.09163633590506318,0.6427501244732013
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Discussion,1.1344731889360093,0.38467930835758224,0.7287143687079097,0.12863494242534934,0.2576995460723839,0.11614174298643018,0.10287381266448592,0.037740644363705556,1.2954373859500783,0.4392593518364224,0.8321076655752359,0.14688625097752994,0.2942631254579264,0.13262045978815407,0.08036276882562715,0.024764556123361647,0.7989440244969934
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Documentation,1.1017178393298988,0.6297914702351072,0.9620099393524547,0.5426315426153278,0.6008967879858863,0.42346695070883306,0.22947687771700964,0.019257269357300988,1.7578017124468288,1.0048385215737463,1.534896375838981,0.8657739947435574,0.9587367701768726,0.6756457094072058,0.3576689942141968,0.28698665082686914,0.5155934534778222
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Factual,1.1562698733884489,0.6583644302045875,0.8661299974422005,0.37733331630350064,0.3114271667890981,0.45740865122148777,0.15913377605297485,0.18902985451657067,1.0993740542217156,0.6259687201469012,0.8235109023299654,0.35876612137428077,0.2961029728677298,0.43490124139947806,0.10508109049949232,0.07454673335278683,0.5828595549740958
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,General Character,0.7372266040260519,0.8526243642405815,0.7954114188266752,0.24633649388653844,0.2950193305223171,0.23099659685389715,0.1579096051244231,0.22615039241237966,0.9552506646110124,1.1047756363599377,1.0306427933066282,0.31918693413666754,0.38226701263290347,0.2993104853548797,0.13835971818280718,0.09011082858035588,0.6571923255492076
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,General Explanation,1.198737214314235,0.6298517931625585,0.9906622128006211,0.2645294419693812,0.23871793156767643,0.22713592949700745,0.1328722458423096,0.06681320972193411,1.0089943068592968,0.5301552883128793,0.8338545936511099,0.22265822546976122,0.2009323069483624,0.19118356968402134,0.08014718606040527,0.06089874596632272,0.6735644059126314
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,GeneralExcerpt Language Translation,1.3511531443414406,0.4986293183132784,0.901464551158019,0.1985387126889755,0.18885097820785182,0.17340830878029323,0.10954341164082976,0.02514975835741673,1.4665765296980746,0.5412251441075784,0.978472913022411,0.2154990479691623,0.2049837306822477,0.1882218583266295,0.10332962910170218,0.05016090371607099,0.738593182871846
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Human Decision Making,1.3340214329452431,0.30816972870672243,0.9333791699108336,0.5292409698123662,0.270568437262016,0.2856893844098338,0.1290805236934695,0.06778011080603269,1.201880504585175,0.27764410667539496,0.8409236913270575,0.4768172295709606,0.2437673952384257,0.2573905433670892,0.0823817376206209,0.08450385383449247,0.48175405947285344
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Hypothetical Scenarios,1.4287686443289096,0.33298255953173195,0.8754778170146695,0.11363610731469886,0.16214802898936512,0.26381226938745905,0.10641226729061837,0.05855534470472795,1.4977067314177797,0.349048968029138,0.9177196216856274,0.11911904949263308,0.16997167138810199,0.27654121138540405,0.1106508405470853,0.03261341711241783,0.8350798942889981
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Idea Development,1.2537517241499123,0.6796383159524481,0.9609330633883223,0.3491432393596494,0.20654570359359603,0.2733030766398128,0.11752951595690886,0.08101892057528301,1.2473557905707873,0.6761711849065404,0.9560309253261918,0.3473621076367328,0.2054920240047332,0.2719088386168549,0.10877743031144349,0.06705458473928816,0.6330568247676382
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Interpretative Analysis,1.2136168697522471,0.07656990257831131,0.8961082582030486,0.3681853164282147,0.36933717714244274,0.14943999893960336,0.13575326466129328,0.10469526305521029,1.4441360797624787,0.0911138940906544,1.0663186210556854,0.43811989825198694,0.4394905479666845,0.1778252260718859,0.14419956771469228,0.14852142067335028,0.5557633268358803
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Literary and Cultural Translation,1.8878663375190368,0.18812710307843505,1.2108732936444575,0.15237571474223843,0.041180451806822886,0.04182389636630449,0.08567623744959674,0.011217358761959784,1.3906928238259524,0.13858344047924787,0.8919873015198895,0.1122472543770564,0.030335494452917847,0.03080948655374469,0.08895319224501319,0.005321805425873349,0.9221037991588011
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Logical Deduction,1.9442280594011287,0.8400570876882061,1.5505896792257643,0.356273912075467,0.18273825489277656,0.06563134506712397,0.10439670146312019,0.02041070969957548,1.4026407868322321,0.6060494440255764,1.118654942359537,0.2570296822664202,0.13183439483231454,0.047348972791887614,0.11137544503687336,0.07220122226478787,0.7122826538984696
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,OptionBased,1.0780913594114336,0.3690154548627018,0.9478117095763935,0.10074375720883555,0.2116932600694489,0.2277793740564891,0.1165628032630186,0.03443544559465739,1.3900482995117285,0.47579391212770883,1.2220708789218384,0.12989501044792373,0.27294890512644504,0.293689703388333,0.11540243230295022,0.04752815325113258,0.8445322761295179
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Paraphrasing,1.3357909054838173,0.0,1.0309232985094345,0.0,0.06048378859127111,0.0,0.09013508835874229,0.0,2.006078533916063,0.0,1.5482311571096201,0.0,0.09083399912722057,0.0,0.14284899505616588,0.0,0.9999999999988173
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Personal Opinion and Advice,2.2446563457515882,0.0,1.4867233383322183,0.0,0.2946976082425763,0.0,0.08637469185608593,0.0,1.5696871343789875,0.0,1.039664936228651,0.0,0.20608189982673825,0.0,0.10855100055701472,0.0,0.9999999999982582
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,PostQuality Assessment Rewriting,0.9398513173353069,0.3709457885411468,0.6980837266575873,0.24866898041465935,0.22134492846167303,0.15764391707299388,0.10713773897960777,0.015032409130061763,1.1311126789859192,0.4464338953366902,0.8401449672431871,0.29927354617642565,0.26638900259465986,0.1897247256851502,0.09240133916855198,0.02107780764234979,0.75720631805173
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Problem Solving,1.4960086007947377,1.261955642283303,0.9841484537271188,1.3274618731305277,0.2782897719757953,0.7026414589539156,0.10397337462157807,0.06485372362492181,1.2470703902847586,1.051964216320853,0.8203845857808767,1.1065701061702393,0.23198191131103574,0.5857208026627769,0.06908376094785396,0.15513547609444056,0.18537920806055078
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Professional Content Generation,0.9439331687595184,0.7254535793517868,0.8069286296048963,0.2660479878861266,0.1769472538574421,0.13202678054863234,0.1301367477972839,0.157841155148832,0.9693636484468922,0.744998005932413,0.8286680734590571,0.2732155800162941,0.18171438535984685,0.135583710712813,0.090187710012127,0.035880152823013295,0.697272398622081
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Quality and Compliance Assessment,1.0287069894712202,0.4533066921547926,0.8734313058463182,0.3300490779115952,0.050188675639565396,0.02380744870081948,0.11143939434798689,0.04367500193359197,1.328007514425131,0.5851954926739671,1.12755463836626,0.42607628807983966,0.06479098428469757,0.03073418485299756,0.10469692167037137,0.054640971535850236,0.6450945454530432
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Quality and Optimization,0.779693944951838,0.8190244936501513,0.8145918755737224,0.3433871474258494,0.30949683311065324,0.5868214382472261,0.1884719004557629,0.18583382210258875,1.04499175681095,1.0977048751398704,1.0917640192682174,0.4602276839241126,0.41480588819976644,0.7864926612020517,0.17188237633084025,0.1832872162222779,0.5464962413910428
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Specialized Summaries,0.84351560219542,0.0743580619050932,0.6503627055872839,0.02889803352296827,0.13158441241398874,0.10906385283213248,0.1111217124768834,0.02286286325841791,1.1289955853500586,0.09952385398389785,0.8704718935527833,0.038678303267673964,0.176117928737325,0.1459754959460958,0.0827744943735732,0.006822326717718652,0.9438960433613603
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Specific Character,1.3010448992718107,0.4698753895614438,0.9997877312145189,0.15062858402864576,0.304992721194282,0.3053144434740228,0.11321290601901124,0.07229923053805365,1.7792828645292569,0.6425921423454453,1.367289614149536,0.2059966251898604,0.41710191779765965,0.41754189872360653,0.17576375000750166,0.014782470326796582,0.8976747050170937
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Standard Summaries,0.9716817153871626,0.4291373058892647,0.796593301368223,0.23721572311044914,0.18080792121433173,0.16729558546521794,0.1172632289111668,0.039336961881892285,0.9639087214345026,0.4257044104969613,0.7902209318811656,0.23531810956888033,0.17936154339746394,0.16595729994071395,0.07204680019342258,0.03956559896203643,0.6796552471612891
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Supportive Conversation,1.7968189323523893,0.0,1.038680380143185,0.0,0.01930333678444823,0.0,0.06886861943702338,0.0,1.4247469333119458,0.0,0.8235981153442465,0.0,0.015306144314900042,0.0,0.07536730830510785,0.0,0.9999999999977752
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Technical and Practical Support,1.4255514215315017,0.3463038726772498,1.1183870749489693,0.2971797849980766,0.15700047251351226,0.0936211834045739,0.1240971347236921,0.05006197308376292,1.2591157218350808,0.3058722709222619,0.9878133667409612,0.2624835090835609,0.13867038416960492,0.0826907413798259,0.1120848802947827,0.044916008966535304,0.7264755942175529
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Technical and Scientific Translation,1.2297029837392874,0.05863388548276127,1.3019807424657441,0.14432176610903946,0.5472495978391072,0.40569179475315365,0.1782837769347535,0.02600212999079632,1.2359760855927386,0.05893299538211161,1.308622555909088,0.14505799684966858,0.5500412902330429,0.4077613562515386,0.1534451542669641,0.008426446521184128,0.929187718379731
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Textual ExpansionReduction,1.2759907767369953,0.1670140784704448,0.938848169393615,0.21013380068570542,0.07463956889986649,0.12362178599040387,0.10931575327724691,0.04323213303122814,1.3954693079964064,0.18265259025210612,1.026758052756241,0.2298098660081278,0.08162851131902456,0.13519722187213443,0.11377074754506045,0.028681335791298557,0.7888981091606756
rb_allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/Llama-3.1-Tulu-3-8B-SFT-RM-RB2,core_eval_runs,Llama-3.1-Tulu-3-8B-SFT-RM-RB2,Tone Adjustment,1.8330126888232297,0.6004946351362102,1.4478776072360144,0.001432110981346213,0.12482824453943188,0.10166424039809399,0.10968929089735291,0.03761146172204699,2.1080148667679106,0.6905853003637217,1.6650989597204084,0.0016469669075107607,0.14355590492425616,0.11691666483521895,0.17107851597705542,0.01635781710778278,0.9980439502327854
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Analytical Reasoning,0.8298415723469295,0.27327842520322176,0.7456893347097092,0.21475992543608036,0.13970396840857396,0.18664450179385483,0.13864096528383713,0.07829501613929932,1.0466235922912683,0.3446677734799255,0.9404880114886303,0.27086230930130517,0.176199257961493,0.23540220863655464,0.09267967980008096,0.04991557910838673,0.6870774040140861
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Casual Conversation,0.5621687688761017,0.16988002558482596,0.577970839969427,0.10465379500117833,0.08941053978148734,0.14138041602947687,0.1301336082724983,0.26581739652476927,0.9583865876178741,0.2896118515266737,0.9853259941286702,0.17841402622376035,0.15242729027719668,0.24102565275081722,0.08944134825561179,0.04263458056002023,0.7691670125663468
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Content Categorization,0.8315180199678324,0.2218499018328155,0.7273609501876621,0.12952110137790462,0.09835159375963608,0.18329160655204907,0.12664109668122703,0.033657974959171,0.6822856900755769,0.1820345557526909,0.5968216634102708,0.10627598189199927,0.08070045796592845,0.1503963080274121,0.0525040739963919,0.013839560827691666,0.8085211701876448
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,ContextBased,0.7376369531972706,0.6345354245117429,0.5735934489592918,0.2019032796767024,0.14193923190311117,0.22240871770644977,0.13124449219686046,0.25998981556819306,1.0804949059576525,0.9294711861476626,0.8402030253566324,0.2957491002060434,0.20791341372215433,0.32578558528117096,0.0846077930962888,0.04823169949016118,0.6471084084462119
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Creative Writing,0.9497075772414859,0.7119314230100932,0.8471973650055765,0.29524299356968076,0.16093897160667725,0.1975414113297236,0.12564590496545752,0.07502341605642004,0.8246741336190457,0.6182023219950952,0.7356598701882956,0.2563728728335287,0.1397506034023449,0.1715341607733643,0.0668814003001178,0.05570902595110014,0.6117736282312864
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Critical Thinking,1.0206771931930414,0.20564424149742078,0.7547739733220556,0.07283233553033663,0.06594027308884692,0.29058425428983387,0.09355604057514122,0.034613838427627774,1.1354445826580657,0.22876737279943493,0.8396425675573467,0.0810217778664667,0.07335475540851451,0.3232582441731148,0.07610517383815962,0.024362233901811026,0.8621814090129075
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Data Management,0.6689026007402523,0.3474437694321235,0.6524718617890882,0.22404732883590872,0.18161515893114616,0.16457127478530015,0.1391039257852874,0.02880330167057668,0.7570233840539,0.3932157803137112,0.7384280704917294,0.25356317478194257,0.20554101906225358,0.18625178496564204,0.06968637024582308,0.04759235570749651,0.6331867771331751
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Development and Implementation,0.985899636557332,0.41694649371538905,0.8864527248787148,0.2036835351074644,0.2103941764233124,0.44984677827560826,0.14632626528871873,0.1671966474522442,0.9296725479099218,0.3931675139956834,0.8358972179129758,0.19206720849586678,0.19839513354917038,0.4241914542020775,0.0821596918020906,0.03533448949616039,0.7411735398963498
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Discussion,0.896899477183045,0.23805556216821,0.6601478187199816,0.09412942938106572,0.08829290803421876,0.14724798270263695,0.09642621026679477,0.021730745807953478,1.4728675768162383,0.39092933814561825,1.0840794791768948,0.15457717178632147,0.14499257142490074,0.24180723145861605,0.11447515012181303,0.02574350000854206,0.8398599637695155
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Documentation,1.1985901969613604,0.3895645159073087,0.7320759591214515,0.15681382387283516,0.09471929058101315,0.05224928418480665,0.09640571909229834,0.029966698994319474,0.8839522601282286,0.28730122703742234,0.5399011274174548,0.115649147125647,0.06985484379823288,0.03853349790639985,0.03944205971559778,0.012864896808157944,0.772143206929284
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Factual,0.9633985661455262,0.3867005845549327,0.7331043355555702,0.20170148505566776,0.12070422870500792,0.18105634305751186,0.10577368031291734,0.03640235279995202,1.3942710663520985,0.5596494071436495,1.0609795359898961,0.2919109022328428,0.17468825425293114,0.2620323813793967,0.11115026761714597,0.05202371291830882,0.7071226593783912
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,General Character,1.1450137250766723,0.5297574482053126,0.8997866925301554,0.14856352840515097,0.14193923190311117,0.28639313523757665,0.11219657991395177,0.049354742166920235,0.9712098145941979,0.4493445115838455,0.7632062810146856,0.12601277497190233,0.12039399361001771,0.24292095167375619,0.07359046529221419,0.018448403697173543,0.8194748548202886
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,General Explanation,1.0561620011688193,0.5937418657364393,0.7084233011367221,0.2491542663251134,0.15423318112306567,0.19111502878292924,0.09803067283407751,0.03490158362326745,1.6817777512015368,0.9454438416146203,1.1280566285910603,0.3967403686733668,0.24559294144530378,0.30432168831265904,0.1289082339951585,0.06784211104239168,0.6559740779652701
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,GeneralExcerpt Language Translation,1.3350111221123329,0.6426382546794405,1.1549437944502214,0.19818449308906438,0.2727021463335364,0.28611372730075946,0.11834691886853843,0.03266294249143542,1.6750087571617316,0.8063039224512314,1.44908228679981,0.2486576748009366,0.342153316657566,0.35898052895220034,0.1690101467614688,0.05342712890496176,0.8005596803252051
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Human Decision Making,0.9231638232438568,0.339760051169652,0.6198509851656862,0.2789034502645614,0.13467462554586532,0.1335569937985967,0.10688574761934372,0.020318372352429837,1.4642033306903226,0.5388835502782787,0.9831276466425537,0.44236066290318743,0.21360351252806759,0.21183086927057332,0.0992433738132878,0.06920169297929224,0.5787078884198175
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Hypothetical Scenarios,0.47164059734734576,0.5873154831896449,0.7065916268842541,0.22455084522204782,0.2995253082679826,0.23805556216821006,0.2569566317788826,0.29090041706403175,0.5856689310681547,0.7293104816974295,0.8774239646597682,0.2788404014228516,0.3719414064603447,0.29561014767184113,0.11120454522595524,0.06277000534351718,0.6658823337555217
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Idea Development,1.1422196457085008,0.7144460944414475,0.8663819214173498,0.26220106470815874,0.14529212714491693,0.19390910815110068,0.10285090452836199,0.056265658381393385,1.1393876961736864,0.7126747404882867,0.8642338670667282,0.26155097942375816,0.14493189873050805,0.19342834176725499,0.08689215302482306,0.0434820319793687,0.6857989804820186
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Interpretative Analysis,1.3579225729313391,0.2098353605496781,0.9264856731593496,0.0926004470601498,0.14193923190311114,0.0519698762479895,0.08342784615476484,0.019092491351079127,2.013528925710229,0.3111440788494615,1.3737938667117138,0.13730803390860058,0.21046763256394985,0.07706098357656432,0.17490534519930934,0.02452350831054989,0.889848680511837
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Literary and Cultural Translation,0.7795481437198428,0.5247281053426038,0.9668445973662709,0.19074248486717293,0.11511606996866496,0.05252869212162381,0.20747436129247532,0.12837556412254536,0.5333983046881238,0.35904014917716714,0.6615540980845176,0.13051370704913712,0.07876706148082688,0.03594225135532876,0.05416534044173582,0.011162835507127322,0.8174318347624127
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Logical Deduction,0.985751201090898,0.5894110427157736,0.6510204927839548,0.14209445853467628,0.18105634305751186,0.13188054617769385,0.11272989004119682,0.03232592909437171,1.31615232269548,0.7869680625640915,0.8692275827325591,0.18972125160548048,0.24174226335223103,0.17608387083681026,0.09240586596996259,0.05506356314674221,0.7265126993203697
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,OptionBased,0.438111644929288,0.16764476209028872,0.5075600398915058,0.11492979801078679,0.3956416385330815,0.45711138463285406,0.22599596759845664,0.5413565064158078,0.5560470614402215,0.21277311024498274,0.6441902924805968,0.14586778780128262,0.5021445401781592,0.5801613472679862,0.058396458594650036,0.02375938524286636,0.7475563481031156
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Paraphrasing,1.3692036683803315,0.0,0.8514743437884181,0.0,0.033528952418057756,0.0,0.11952221304476707,0.0,4.060513090258133,0.0,2.525133987598577,0.0,0.09943352719556686,0.0,0.47180907693662044,0.0,0.9999999999993326
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Personal Opinion and Advice,1.440627322229215,0.0,0.9523153846517792,0.0,0.053646323868892405,0.0,0.08738181632182362,0.0,1.3542345622512948,0.0,0.8952061287186118,0.0,0.050429215661801516,0.0,0.0879122693630614,0.0,0.9999999999979656
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,PostQuality Assessment Rewriting,0.8144741358219862,0.08287937925838651,0.6270127538795202,0.1380702081113514,0.16764476209028878,0.10170448900144186,0.13729492760459683,0.05013487049644494,0.7074515713222929,0.07198896135282173,0.544622767576913,0.1199276703643789,0.14561610387422838,0.08834043635036522,0.045957434741793146,0.021056601033609068,0.7427118081833658
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Problem Solving,1.0718088456305797,0.21905582246464395,0.8264265864524977,0.08447433289771777,0.15423318112306567,0.16764476209028878,0.12237190038606005,0.00799208563721654,1.2535593766241373,0.2562019163903346,0.9665667537288418,0.09879895330188504,0.18038706358094989,0.19607289519668467,0.11142287692048525,0.025772185638321177,0.8678141776408599
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Professional Content Generation,1.190417514809459,0.46661125448463714,0.9125152763184922,0.21779096795900055,0.14417449539764834,0.25537885425087326,0.11094037904186588,0.07163222355019477,1.0742683666355268,0.42108395077604266,0.8234810755262953,0.19654108286094474,0.1301073764074479,0.2304615155744329,0.07325930768573136,0.025590316736926078,0.7530829510907463
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Quality and Compliance Assessment,0.9186932962547825,0.2585920455242705,0.7231271438117245,0.19215116654862618,0.2313497716845985,0.1324393620513281,0.2015476234340492,0.03149557017978982,1.3045444806817912,0.3672007046444641,1.0268405442126487,0.27285465649904894,0.3285166757921299,0.18806389411288593,0.16459988072509402,0.07359090586456662,0.7165324765421148
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Quality and Optimization,0.715843134125533,0.3925244937379653,0.570629590462846,0.13578328325349653,0.09835159375963608,0.29477537334209114,0.12236635107965299,0.08173249840070179,1.0950857577813924,0.6004779010668239,0.8729403240667366,0.20771916715712357,0.1504567473610656,0.45094280814183013,0.10740478691021016,0.0472990269735617,0.7342087228965546
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Specialized Summaries,0.7029903690319442,0.11511606996866486,0.623234925733805,0.12713061125180236,0.19055621290929492,0.13243936205132814,0.17872032544953975,0.05231890286725416,1.2052689991783478,0.19736519382411744,1.0685291979555436,0.21796398832517805,0.3267064616214758,0.22706578124425147,0.13371979417198326,0.0002832155912158574,0.9958163807417397
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Specific Character,1.1735133346320215,0.15842430017532294,0.76120035586885,0.07171470378306788,0.109527911232322,0.2425260891572844,0.09378429187582893,0.0873194957868415,0.912648579188692,0.12320755819047347,0.5919902252150142,0.055772968728197925,0.08518053405761125,0.18861403969899634,0.04835041434410137,0.014414498938872977,0.8666001279891852
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Standard Summaries,1.1645722806538727,0.18943858116202628,0.8617251224703972,0.1604112010593559,0.07376369531972705,0.05141106037435522,0.10253324786480589,0.036899502842748844,1.0584629086967052,0.1721779875471129,0.7832095051368747,0.145795421444761,0.06704275621303507,0.046726769481812314,0.06961812566677017,0.01799065815208223,0.8054050806187636
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Supportive Conversation,1.0852204265978027,0.0,0.6983335700849918,0.0,0.15646844461760287,0.0,0.07760798350871967,0.0,1.903867812359808,0.0,1.2251288068206179,0.0,0.27450205327535854,0.0,0.12787329967886696,0.0,0.9999999999985218
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Technical and Practical Support,1.1846896521047074,0.4928756005454489,0.9415116110948495,0.3012483238783551,0.16093897160667722,0.2302321399373299,0.10792710968726948,0.04128129992392726,1.1876415366199184,0.4941036958956453,0.9438575702587072,0.30199894262556015,0.16133998233327196,0.23080580806009743,0.08370628790631107,0.043059324791245523,0.673999184418282
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Technical and Scientific Translation,1.5120160500859963,0.09932952153849617,1.0765665418880492,0.03926457645438752,0.41799427347845336,0.3263484702024288,0.11837707881416693,0.018936594429642684,1.2929531508705614,0.08493852815937997,0.9205921474026748,0.03357587232054249,0.35743470640346664,0.279066669170621,0.10824709598734417,0.005036110406713923,0.959935491284097
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Textual ExpansionReduction,1.3613191256632726,0.4165579420532526,0.8493855754274205,0.22053144563095972,0.1234983080731794,0.15563022080715141,0.09916907038594536,0.022663503101155502,1.658871463579576,0.5076077092965936,1.03504128171721,0.2687344319803817,0.15049213310886173,0.18964732611229856,0.1090376633747634,0.08021711143894392,0.6812571358008815
rb_allenai_tulu-v2.5-13b-uf-rm/tulu-v2.5-13b-uf-rm,core_eval_runs,tulu-v2.5-13b-uf-rm,Tone Adjustment,1.7500716122542064,0.900811188298485,1.0743312783935122,0.47555230846278573,0.1352334414194996,0.012293949219954498,0.10825980021832199,0.02048400595455907,1.875132211265714,0.9651834037072982,1.151103287191273,0.5095354072921283,0.14489726035804354,0.013172478214367567,0.16963755567230315,0.10596494655329919,0.5956320004767612
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Analytical Reasoning,0.881113397003559,0.56238592701627,1.000089027815545,0.46008721986469614,0.40749375191761766,0.5648294932042396,0.23033503032662195,0.1165854781116735,0.9456214874316655,0.6035593359768348,1.0733075643420324,0.4937711339724373,0.43732719207070614,0.6061818005780584,0.1519277325029182,0.11238057876754257,0.5670669788147666
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Casual Conversation,1.2272354049348893,0.4498603274189855,1.0586413433922057,0.33711978449961133,0.5528111370552458,0.5459625025692353,0.15465388904754196,0.11621769339875243,1.3176414000486365,0.4829999112339318,1.1366276235574233,0.3619541801401417,0.5935347347892055,0.5861815861623967,0.15550785275122075,0.12541571611632352,0.6334337392089532
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Content Categorization,1.1779651315937856,0.5952327759103392,1.012196039427141,0.48747944907273233,0.6130525238933564,0.8827424411288719,0.1622996717056171,0.05486603353869557,1.0578327102414449,0.5345291500384631,0.908969247870832,0.4377648310369864,0.5505315866124475,0.7927177162545395,0.14846244729456803,0.05787328081871823,0.6499396234128137
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,ContextBased,1.1973806779036402,0.6067025763844828,1.2301434888071177,0.4265731462651061,0.341234875555391,0.5240867671721731,0.1671313443556633,0.17038293935272736,1.059695480427022,0.5369386612148136,1.0886909396666726,0.37752207256266235,0.30199673509281855,0.4638227330478558,0.1374274137395473,0.08613477629883032,0.6398640547178633
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Creative Writing,1.1582836188961245,0.7987967999775313,1.090299654990766,0.4929206780899348,0.38944128334526995,0.46831694022905457,0.14866770561173237,0.132878454612283,1.1018031723633055,0.7598457182081843,1.0371342554600833,0.468884786060043,0.37045127327967164,0.4454807803529191,0.14419837841158817,0.12417101372664363,0.5527187302027916
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Critical Thinking,1.0274281560954603,0.40799243889475445,1.118580747560828,0.1920942235930485,0.2579873961720415,1.1189870850977544,0.22227239921769892,0.2579879162625799,0.9426764847532877,0.37433749096855895,1.0263099767779522,0.17624853510563343,0.23670623614048383,1.0266827958784028,0.14433605269130367,0.11979457712755914,0.752331993724826
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Data Management,1.0850763706524602,0.4842749234974282,1.0832330758258586,0.43724412408142865,0.5223579856514325,0.4982880275549689,0.32105639354141907,0.22030055714970123,1.1729555311059845,0.5234958252299564,1.1709629498271004,0.47265605228921714,0.5646631933555127,0.5386438354132145,0.19632670735870283,0.10177640389254672,0.6351570758080431
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Development and Implementation,0.9743678617281177,0.7772867683637024,1.0965868048802965,0.1047538170195701,0.2322135909036942,0.277436188280372,0.20394413953872742,0.198305114608207,0.957565160760514,0.7638826756715168,1.077676472457443,0.10294736677451244,0.22820913254476594,0.2726518788913092,0.11841787748357074,0.05933135016259783,0.8517514800058785
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Discussion,1.1348785767691765,0.57230979786129,1.2542069824427666,0.13209542719069067,0.7002894990937993,0.832175581647213,0.19585347183331214,0.27135636903069527,1.0691129534497146,0.5391447427103386,1.1815263400723834,0.12444056588246344,0.6597080868134753,0.7839514394715201,0.16306743084097042,0.12562055531527866,0.8308479572576368
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Documentation,1.9939500093832945,0.40772647250694805,1.3708055386637348,0.32040084184390305,0.4579941198023253,0.3792680690116817,0.09914699006429001,0.011377565392131694,1.5008257694727416,0.3068915438977875,1.031791302530602,0.24116243523518244,0.34472748766600786,0.28547119478032934,0.12502215800494332,0.053193148766153886,0.7395488752868898
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Factual,1.3109483254968997,0.6920777868702822,1.1524619101852551,0.33885687746997073,0.45919096854745334,0.5157088259562768,0.15955126300428857,0.09696082780293491,1.2093724821608358,0.6384537168072153,1.0631660255474016,0.3126013245776086,0.42361160285061533,0.4757503028829095,0.13126973009032628,0.0759036876199673,0.6771734437848621
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,General Character,1.3270392919591765,0.21267337284955534,1.0674228516618218,0.2897833084365151,0.6121216415360347,0.38880961317423013,0.1937292603370136,0.05536109631218172,1.2610291876003643,0.20209449125863133,1.0143266892044513,0.2753687944524601,0.5816732488750853,0.3694692942425658,0.162663982124521,0.05721267939385899,0.7360669833522878
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,General Explanation,1.0322155510759723,0.4811331955414673,1.024831289836741,0.31441844510706707,0.18537857230094118,0.6355266836629827,0.17561772008022958,0.1462943558917913,1.0429078237166,0.4861170453757613,1.0354470721219027,0.31767536923965045,0.18729882842836132,0.6421098285933563,0.13852387640994746,0.10343920707188359,0.6473224362962301
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,GeneralExcerpt Language Translation,0.6889859276120346,0.46750241816639804,0.9535116641509791,0.07762893944094285,0.4585260525779377,0.5407761580070138,0.3197199692051468,0.15374395024715382,0.9200991639937441,0.6243212914533888,1.273357335590776,0.1036687679948558,0.6123338964389943,0.7221739529782976,0.20512421075806347,0.05332439717271184,0.8833843256285546
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Human Decision Making,1.6360257429930622,0.5416114586937177,1.2244519928069448,0.36883766148710695,0.422753573418,0.5236047030942742,0.10952370986134752,0.06539895236813592,1.3627726340057171,0.4511501590564193,1.019940960341078,0.3072334733228127,0.35214421487536474,0.4361509367866906,0.13973044129438644,0.0652472366124063,0.6933564021064245
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Hypothetical Scenarios,1.1282294170740208,0.28318771141668186,1.0452183522576102,0.31528468285624156,0.32275021160285816,0.1808571437082353,0.13261355052184975,0.016711408646078618,1.342651244345245,0.33700799444049956,1.2438637922690283,0.3752050472658044,0.38408941183709444,0.2152293366701477,0.14789524568195012,0.029472465754377508,0.7866265590288443
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Idea Development,1.162306360511694,0.531442400084939,1.0191979739839008,0.24157020531222118,0.5018785737903529,0.7696734805127492,0.18094552499398142,0.19071656535996323,1.2564402942274746,0.5744833446783484,1.1017417144173796,0.26113471469385874,0.5425251761007396,0.8320085023786583,0.15000038457143583,0.11020451255130875,0.6987854232056832
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Interpretative Analysis,1.1809572534566057,0.4520431218751608,1.2630106545780333,0.43379625773117025,0.3792348232132059,0.419611845462039,0.1875245452819269,0.1731219035643222,1.1647388618714962,0.4458350966968846,1.2456654024004832,0.4278388214603426,0.37402669324272725,0.41384920739572917,0.1475321414299447,0.017646761494163193,0.837417418188675
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Literary and Cultural Translation,1.2432598798002146,0.18002599874634084,1.2214506360001038,0.23766497835931988,0.8324747938334949,0.34841596802615915,0.14106143250679803,0.055098926106482315,1.2692862540955492,0.1837946589455397,1.2470204560774008,0.24264025165830727,0.8499018024000576,0.35570970004603053,0.1916693683881736,0.05053345086718247,0.7993903682475855
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Logical Deduction,0.9394597733285502,0.44496176679982014,0.879469576967812,0.08621281990850838,0.547358826105218,0.20730417639571713,0.1931567078791001,0.2562012340555532,0.9732312030887269,0.460957124429879,0.9110844964012955,0.08931197356535026,0.567035123775667,0.21475628731190305,0.09760847805801254,0.01592602651880437,0.8847295001571721
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,OptionBased,0.8619970628799862,0.1247382358811211,1.0777844588534393,0.23990537577882642,0.6542773140033218,0.28937142993317644,0.26735393606553715,0.07198179420752226,0.8547505577570459,0.12368960555015562,1.0687239052403998,0.23788857594334456,0.6487770355082793,0.28693878643618204,0.16298132901782691,0.07369231869835546,0.7420636256284479
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Paraphrasing,0.7532168102672387,0.0,0.7906626612837907,0.0,0.9407231136706298,0.0,0.33283428714316654,0.0,1.3513674077380387,0.0,1.4185500594380567,0.0,1.6877777264016394,0.0,0.3338474556706822,0.0,0.9999999999988587
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Personal Opinion and Advice,1.2420630310550866,0.0,1.2933354402599537,0.0,0.19362353032293428,0.0,0.24803483641690205,0.0,1.3204241654813245,0.0,1.3749313252983622,0.0,0.2058391418566176,0.0,0.24002411715624916,0.0,0.9999999999987615
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,PostQuality Assessment Rewriting,1.3808974854899378,0.4988199603305814,1.4886434243723274,0.21100000099294114,0.2606470600501038,0.41065210277281666,0.13498309197733194,0.10413182499234197,1.2261750781521297,0.4429297686969029,1.32184864293005,0.18735854429905419,0.2314429076635376,0.3646406626862265,0.17635919830154467,0.03765935057681802,0.8433413373452088
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Problem Solving,0.7532168102672387,0.10971113497006924,0.8548750740509528,0.3610124316708708,0.3525384470371557,0.25160420286469204,0.33342304145302515,0.22408491695854138,0.9725854535833592,0.14166366511410156,1.1038509102816783,0.4661545451717557,0.4552125772333131,0.32488200532833966,0.1946911628541097,0.10978489593719742,0.6169733884969596
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Professional Content Generation,1.1248050998310157,0.20635667113915734,1.083713292914953,0.3593981634559913,0.2886732681651851,0.24542048434819724,0.15508598633858428,0.17975360722074765,1.1299010157884606,0.20729156755235922,1.0886230429360817,0.3610264125068774,0.28998109892977875,0.24653235889665168,0.12407586690580424,0.11364283862526614,0.6144959211939558
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Quality and Compliance Assessment,0.7771537851697994,0.15665420241786854,1.0323115944937913,0.1352032744457745,0.4421691197278547,0.13451250063300008,0.406191948080057,0.1425693930854693,0.8666939310424852,0.17470319143874868,1.1512498696729352,0.15078078451824295,0.49311384680291975,0.15001042286951977,0.17998888691161685,0.03975009405035812,0.8483779291894867
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Quality and Optimization,1.0889993748726021,0.3130424384479309,0.9829161876612023,0.22345978746614137,0.5199642881611765,0.43851208189551905,0.17986881224412943,0.1453403541555212,1.1497512134375183,0.3305060882197972,1.0377499799786882,0.23592590383602596,0.5489714549445227,0.4629752871303232,0.11884817737838771,0.04292801466290824,0.7565028357501773
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Specialized Summaries,0.7151836168109481,0.5407096664100622,1.040246258396677,0.3768522076557751,1.2469834092295018,0.6057384482286853,0.7556054202262005,0.22429203734875114,1.1113392476915223,0.8402204129999498,1.6164610976066012,0.5855987736611798,1.937714415322314,0.9412700396494762,0.5070557937875476,0.28153280051278773,0.6450287422722156
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Specific Character,1.2399352999526367,0.28677825765206566,1.1239517910034704,0.26066552993814596,0.2968184887917509,0.27082027438369205,0.15551708631861205,0.07803274589559461,0.9299056397101543,0.2150730922388404,0.842922295426527,0.19548951173240814,0.22260289444799058,0.20310519468787314,0.09076442497689408,0.036926304256996,0.7442141598581469
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Standard Summaries,1.1080159716007474,0.7518703554289697,0.9224896401954694,0.4046512361479384,0.34721911928103116,0.34243172430051894,0.2092333828326396,0.22345691144629493,1.201851370393283,0.8155445771462221,1.0006132282017581,0.4389202458815964,0.37662433126462574,0.37143150249192314,0.17554944856214139,0.15160180569440573,0.5588991095198625
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Supportive Conversation,0.560125212719917,0.0,1.0580142906931764,0.0,0.2912331946478201,0.0,0.4613067101537768,0.0,1.2120115524383186,0.0,2.289355154605713,0.0,0.6301769467806073,0.0,0.40866863571751766,0.0,0.9999999999992587
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Technical and Practical Support,0.9859373995976887,0.370928568250273,1.1639982022563502,0.4394586636576765,0.3138403376113495,0.16435060476501126,0.20650039975980095,0.2399455987004056,1.006982378324502,0.3788460930659254,1.188843914990779,0.4488389734336963,0.32053930575206696,0.16785869258319314,0.1281368334349987,0.15358422069572852,0.563147867834581
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Technical and Scientific Translation,1.2780682308043547,0.08660530502940311,1.070309695162802,0.07959598251742639,0.27288151388919035,0.18192100925946023,0.14190860153476276,0.0011156780715446124,1.132488131837821,0.07674041004701859,0.9483946145817141,0.07052949394276509,0.241798574151987,0.161199049434658,0.10679057118660246,0.012694387519577022,0.9125665642670839
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Textual ExpansionReduction,1.098640656430578,0.36168104161799475,1.2014403592952936,0.08021195328363051,0.27022185001112803,0.28498298453437376,0.25657887769295995,0.11614997853347991,1.0266799464607814,0.3379910166902753,1.1227462924635152,0.07495808881705668,0.2525223810698188,0.2663166646814083,0.15847013030470036,0.018812604838424535,0.9180873182900131
rb_nicolinho_QRM-Gemma-2-27B/QRM-Gemma-2-27B,core_eval_runs,QRM-Gemma-2-27B,Tone Adjustment,1.2227222877918025,0.3531618057585766,1.4049392110432515,0.2828650160870321,0.4554674391181661,0.3876460102275779,0.18968351513421777,0.008030743236836613,1.3863076883156416,0.40041056863920876,1.5929030241223585,0.32070892178236843,0.5164034539415086,0.4395083410918241,0.2028179530846373,0.06918847719462523,0.7762958144281812
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Analytical Reasoning,0.8296121436833079,0.7204526510933988,0.9129180722387646,0.33660702013250876,0.2746223024104024,0.4831743856216495,0.223817301578971,0.20448460487773426,0.6064537090188067,0.5266571683584375,0.6673510689964569,0.24606266602463578,0.20075129702977118,0.3532046878703715,0.062020865962772065,0.062118793115128024,0.583330183088245
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Casual Conversation,0.39067607874283194,0.3510339472233387,0.5559473806815889,0.18154947188637482,0.24474707285948002,0.511900567882152,0.3529969601241031,0.5036414229513488,0.4728388829320461,0.4248596433404121,0.6728682731135931,0.21973101030371556,0.2962196531309583,0.6195580069006664,0.09458738968942931,0.0580784738085936,0.652881265339337
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Content Categorization,0.7767559683239835,0.3860798895811516,0.7965451161034407,0.2765373812277693,0.5446484156591246,0.34011799796434783,0.13596652428224731,0.06776635166649136,0.8635525632692772,0.429221392394197,0.8855529983953024,0.30743833859981173,0.6055087499846707,0.37812360758536406,0.09997826644458585,0.055420941813532965,0.6524983584514079
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,ContextBased,0.8905116500755729,0.7721597791623032,0.7955237407341784,0.2919856836878616,0.20223232311393655,0.3998684570661927,0.14826445770989322,0.11643128039360026,0.9752431690193307,0.8456302059109552,0.8712172309906021,0.31976790359099794,0.22147457773858348,0.4379156423467446,0.1174411562237036,0.0915639199636488,0.6070726178601967
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Creative Writing,0.7526259752251616,0.40446464622787304,0.778862555023087,0.3455121366332642,0.29645420092838426,0.3987194097757726,0.200322672202339,0.21850984536208773,0.7617093921675342,0.4093461160961407,0.7882626203423312,0.3496821106960797,0.30003209645683027,0.40353154058341123,0.08812707715228324,0.08410694515174105,0.5456997424360315
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Critical Thinking,1.627050963234853,0.9238340214977554,1.1899023051905862,0.5755450205793092,0.27577134970082257,0.43663797035963575,0.12502783851415744,0.017444454005732357,1.311921587113817,0.7449046299714046,0.9594404576193325,0.46407270203304185,0.22235959103624017,0.35206935247404697,0.11390926062915996,0.10961988122090649,0.5252948764607511
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Data Management,1.0192049466026234,0.6072714929870198,0.7448379880345364,0.49717041997857214,0.29760324821880435,0.2861127753146034,0.13622851230234595,0.08378781513040504,1.0350749198196194,0.6167272774798972,0.7564358113776348,0.5049118279021138,0.30223720883120786,0.2905678185288446,0.07580713266359695,0.06691347961594005,0.46759956020596566
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Development and Implementation,0.7537750225155816,0.1735061408534342,0.7803946180769805,0.3204884400863379,0.37573846396737076,0.6107186348582799,0.22136214176788016,0.1995324463359265,1.0148301335244185,0.23359657036918768,1.0506689009430907,0.43148328974926664,0.5058680696074463,0.8222289877564457,0.14819899061746328,0.14900242583644616,0.5581359477482809
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Discussion,0.7491788333539012,0.8370809510710384,0.9383886205097434,0.3295531464885406,0.2608337349253613,0.46249153439408786,0.21080574232578114,0.1793325578876863,0.8673924559738464,0.9691647303327408,1.0864578308465866,0.38155364297588457,0.3019909317577655,0.5354685023458177,0.12678782634535457,0.0588827651605216,0.6703576850945029
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Documentation,0.9203868796264952,0.4906431930093802,0.8937034481045174,0.04430215664175241,0.3722913220961105,0.1792513773055347,0.3956828532925313,0.2975722899185535,1.2448550653012735,0.6636118762592308,1.208764801929382,0.05992019803849935,0.5035368803465825,0.2424436831298361,0.19652857322308065,0.02395927182128249,0.9319434501428936
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Factual,0.9330263998211163,0.598653638308869,0.7044936609486754,0.37044007923932254,0.3125408629942656,0.342416092545188,0.12595885642677018,0.1387244679036187,1.2116957891482198,0.7774550568303232,0.9149065906567975,0.48108036839266,0.405888244640783,0.44468638567262253,0.11015593065003604,0.0939180665090163,0.5217984608557004
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,General Character,0.6515098136681934,0.4601934398132477,0.688406998882794,0.19977463863164913,0.41135892997039364,0.26140825857057143,0.2574953426716289,0.19251467613011228,0.6841435789324319,0.483244274007829,0.7228889236926657,0.20978123949334182,0.431963670648696,0.2745020532753584,0.09298138414794405,0.052251352128117434,0.6819884112567102
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,General Explanation,0.8962568865276733,0.6319760097310517,0.7936086619168116,0.22629848025217958,0.2665789713774618,0.38837798416199165,0.19530902813159878,0.20405833334942342,0.9600415919557377,0.6769524045841739,0.8500881104838839,0.24240366406574698,0.2855508324791425,0.4160180231808197,0.100317931936311,0.07777688485720213,0.6590542554523027
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,GeneralExcerpt Language Translation,1.3960924578604144,0.29760324821880424,0.9743282663156607,0.20805735451676055,0.12869329652705053,0.25853564034452114,0.11701580856052252,0.024418102154954546,1.2844925905392839,0.27381364687216037,0.8964430914220891,0.19142581050366536,0.1184059013501233,0.23786899824801558,0.09943349791235268,0.03898814827556679,0.7584361972640569
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Human Decision Making,1.1789225199710165,0.3039230083161151,1.024886347094145,0.43976593242800155,0.11490472904200941,0.3315001432861971,0.14641293336293232,0.08593280241491488,1.0745768932033954,0.277022990499316,0.9341743567457501,0.40084255020999526,0.10473458998083776,0.3021592920947169,0.10650469762028125,0.07934315885960044,0.5801126310297076
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Hypothetical Scenarios,1.4156262617975557,0.7595202589676819,0.9726046953800306,0.4202959644514389,0.20223232311393655,0.6607021919915539,0.15948744486089628,0.07777808596942382,1.001171826114657,0.5371546891735295,0.6878541640820216,0.297245459196379,0.14302454658780814,0.46726769481812314,0.059736782028842805,0.08563566680629858,0.5147461308912371
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Idea Development,0.8342083328449882,0.6359976752475219,0.8348466924507771,0.3102108504331358,0.31024276841342535,0.46134248710366776,0.16733694934792093,0.14797907537719432,0.9471572367118146,0.7221095461707843,0.9478820279856986,0.35221231954400545,0.3522485591076996,0.5238066536360793,0.128929750904408,0.10569832541818294,0.5996705736207854
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Interpretative Analysis,0.6905774215424765,0.05228165171411414,0.6714266333688083,0.11531966278577221,0.22061707976065803,0.06377212461831522,0.19257275696742315,0.03878539267323744,0.8498300764055213,0.06433821709892051,0.8262629639150743,0.1419132957133088,0.2714931358899503,0.07847848459318874,0.09518129113953772,0.01443306909236125,0.851233482668209
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Literary and Cultural Translation,1.3145101002405875,0.2642808767966214,0.8829790067272634,0.06485733594815635,0.19763613395225615,0.04136570245512339,0.1044087202363293,0.01830419390440763,1.2526256961708198,0.2518390822721053,0.8414101898521162,0.06180398734020742,0.1883318354382701,0.03941829113824258,0.09169575051034373,0.01355762286062906,0.9065159655769661
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Logical Deduction,1.539723369162926,0.9583054402103585,1.1420253347564155,0.6091227358438077,0.2298094580840188,0.11490472904200941,0.12479514772893319,0.07214777227041858,1.5912282001736577,0.9903614320483811,1.1802268864472112,0.6294983203174561,0.2374967462945758,0.11874837314728787,0.16489756973357994,0.08761729227460036,0.587221375422849
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,OptionBased,0.5653312668866862,0.7790540629048237,0.8431453673260333,0.5580539673806921,0.0965199723952879,0.013788567485041134,0.12579818557776512,0.050715737056218235,0.7277400299615895,1.002861260800727,1.0853647601324157,0.7183721027466367,0.12424829779832015,0.01774975682833145,0.08905285111427796,0.10647538154541203,0.4265346917730988
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Paraphrasing,1.3903472214083137,0.0,1.229097584986027,0.0,0.500984618623161,0.0,0.17344952720394435,0.0,1.3335519917756213,0.0,1.1788893503052253,0.0,0.4805195606728685,0.0,0.17011870768949278,0.0,0.9999999999985174
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Personal Opinion and Advice,1.5604062203904876,0.0,1.2253950992724512,0.0,0.2068285122756169,0.0,0.12021607819371882,0.0,1.1116000623412818,0.0,0.872945295234137,0.0,0.1473402144487708,0.0,0.08824956424162989,0.0,0.9999999999979192
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,PostQuality Assessment Rewriting,1.3926453159891539,0.8824683190426321,1.2506741396616934,0.6891730304097409,0.3768875112577908,0.3309256196409871,0.1411786094154609,0.007217753533012128,1.1513842260361034,0.7295900046169368,1.0340080563118508,0.5697810828880534,0.31159573113848343,0.27359625173135127,0.14841853484173384,0.13657428273684102,0.5019254938113517
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Problem Solving,0.5975045910184489,0.5952064964376087,0.7957790845764939,0.4546397112428839,0.18384756646721503,0.2114247014372973,0.27849379516697426,0.09424760980252855,0.8471864611783322,0.8439280517122617,1.128314789000971,0.6446220060376157,0.26067275728564065,0.2997736708784868,0.24421841817426015,0.18474336424065607,0.5215344614394039
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Professional Content Generation,0.9514111564678378,0.453299156070727,0.8702756505720634,0.19785955981428216,0.18614566104805524,0.464215105329718,0.15379178235960567,0.1589156843301088,0.9231047088373898,0.43981256960911874,0.8443831518773575,0.1919728291090641,0.18060744303340237,0.45040374682404044,0.09424405368528582,0.05662484268326043,0.7190108813245959
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Quality and Compliance Assessment,1.5259348016778849,0.6790869486382756,1.160410091403137,0.49728213290958523,0.12869329652705053,0.293007059057124,0.19962755705613033,0.11626053725068258,1.6405196494229275,0.7300806572356553,1.2475471129187572,0.5346238315346639,0.1383570788669939,0.3150094206346736,0.18617575551286603,0.1143070269155197,0.6036481374645412
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Quality and Optimization,1.3512796135340306,0.7135583673508783,1.216585736712564,0.35039558761754974,0.44812844326383666,0.47570557823391896,0.1402276906578186,0.03667017028327951,1.0806554947472424,0.570652263807855,0.9729370946189034,0.2802209944570868,0.3583806487682182,0.3804348425385701,0.11976539181271795,0.06575036541260676,0.6853560294085569
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Specialized Summaries,0.7916935830994447,0.5090279496561017,0.6028668117070759,0.2696430974852486,0.268877065958302,0.025279040389242058,0.22095386744452783,0.14308053573141655,1.7670119136097875,1.1361194161525918,1.3455620474222572,0.6018269894442623,0.6001172536787958,0.05642128026039961,0.13726605986399532,0.04913370597365918,0.6718968218288517
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Specific Character,0.5423503210782844,0.08502949949108696,0.6028668117070759,0.03364155122507728,0.5929084018567685,0.41595511913207395,0.2872689998394734,0.13792675923343883,0.6981571019143704,0.1094568337747106,0.776058812438714,0.043306119669123455,0.7632395436182524,0.5354509976546654,0.093185121036374,0.034819526951107516,0.9074621095638735
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Standard Summaries,0.7813521574856639,0.782501204776084,0.7713937476353563,0.28145275019234406,0.2114247014372973,0.2240642216319183,0.12011442946380335,0.24730632616988774,0.9360279927430302,0.9374045044970638,0.9240982242080699,0.33716890686307344,0.2532781627422317,0.2684197920366042,0.1222994546192061,0.0700052194592039,0.6404048755293354
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Supportive Conversation,0.6595531447011339,0.0,0.8490182756992916,0.0,0.2527904038924207,0.0,0.24599621483540524,0.0,0.8237397785948205,0.0,1.0603696336925195,0.0,0.3157190789039382,0.0,0.11749039634443048,0.0,0.999999999998302
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Technical and Practical Support,0.8089292924557462,0.4228494028745946,0.8031840560036457,0.3658438900776422,0.3630989437727497,0.20108327582351643,0.1664696650973898,0.12590206251998892,0.7305253764177277,0.3818655376729032,0.7253369859602155,0.33038517457780914,0.32790627691477553,0.18159366601292948,0.06675643673173781,0.04504649179561082,0.5805016272026592
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Technical and Scientific Translation,1.1812206145518567,0.24819421473074055,1.4972086194173824,0.06217622560384295,0.5055808077848414,0.353906565449389,0.28059246976253754,0.12831391047868537,1.0569763730822253,0.22208842080326918,1.3397278347530541,0.0556363482156339,0.452402338673326,0.3166816370713281,0.1672324217591915,0.024669118472906093,0.937244980483901
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Textual ExpansionReduction,0.598653638308869,0.6859812323807961,0.5645013993991606,0.28790018221081237,0.21602089059897767,0.17925137730553464,0.20695610776111384,0.11721773955606088,0.6412585505242813,0.7348010646122761,0.6046757690644293,0.30838942875501263,0.23139464011240857,0.19200831839114754,0.05787117050877322,0.05887680382683125,0.5069116814928706
rb_nicolinho_QRM-Llama3.1-8B-v2/QRM-Llama3.1-8B-v2,core_eval_runs,QRM-Llama3.1-8B-v2,Tone Adjustment,1.1352587229350528,0.08043331032940637,0.9442615288830016,0.05885675565374038,0.14707805317377204,0.08273140491024675,0.16910980067654968,0.05170645029018994,1.1469825651684613,0.08126394692489103,0.9540129388198314,0.05946457068630928,0.14859693151980066,0.08358577397988787,0.12488507738642318,0.02395239977492436,0.9102052089040655
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Analytical Reasoning,1.128179169352546,0.5217669076081382,0.8460840694556356,0.4248926931194328,0.14631087519590902,0.20319635345795484,0.11007841949326302,0.06307249415139549,1.2783029610648948,0.5911970377576901,0.9586702189472884,0.4814320300392655,0.16578007295265146,0.2302351500178463,0.10107044459205844,0.07080210213020971,0.5571066120668084
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Casual Conversation,0.6309656492823577,0.24084290170235678,0.5226757425223805,0.10697366309107459,0.05534161350754351,0.07161157284345221,0.09882890549713097,0.09001325686342102,1.067173735884202,0.40734581900492406,0.8840193209219358,0.18092820712500246,0.09360115959383365,0.12111909707099075,0.08883052515590606,0.010565212307795835,0.8603256267965749
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Content Categorization,1.0163380437715823,0.3242412881522976,0.7597661951686552,0.12145399287715952,0.2403678663932791,0.34250045784496885,0.09209756426246585,0.04816189877071342,1.1698418036379798,0.37321343589420364,0.8745183372266052,0.1397979333324877,0.2766720975184939,0.39423039982378283,0.08723154800615984,0.02338003385638443,0.8185121487003912
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,ContextBased,1.075836216233555,0.4362902416809826,0.8032055350987537,0.19447912565074899,0.16816249941347985,0.16840001706801866,0.10939373118112267,0.03737353641569746,1.2809668323057601,0.5194780770706795,0.9563534248623422,0.23156062770092856,0.20022618771883832,0.20050899306872366,0.11862658055299274,0.04003598357018906,0.7542996074484931
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Creative Writing,1.3988602264063412,0.4594779027053346,1.015971870720835,0.2556679619731541,0.2011774533943749,0.31447337460938884,0.10668868190166675,0.051626883852076866,1.3049978081266123,0.42864729770284926,0.9478009592245373,0.23851284330264932,0.18767860488721294,0.29337245911531273,0.10024012986731062,0.051504196836008376,0.7133137803849705
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Critical Thinking,1.3980289146154552,0.4410702844785763,1.0215370413209317,0.31872230154058323,0.34297549315404646,0.37527789417132507,0.1267902901326623,0.049147502776082264,1.431616898004684,0.45166710492604456,1.0460797162374995,0.32637968205539647,0.35121556247345626,0.3842940365014272,0.12236051802555747,0.02890910401148583,0.7642203283619519
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Data Management,1.1761874252762037,0.26762301725160786,0.8982765122553442,0.1593380587761003,0.24345559590228366,0.2103218830941192,0.12363098346551704,0.07307159197423152,1.3438254841794808,0.30576643059354414,1.0263048584498549,0.182047979243384,0.27815450752907267,0.24029835747999403,0.11174517218529123,0.06330335629434006,0.7632513840550295
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Development and Implementation,1.3414997128352177,0.3209754204023887,0.867686630021572,0.12115709580898604,0.23609254861158047,0.13324740419627432,0.091301978936427,0.019064966008203676,1.5810432909416101,0.37829008096604433,1.0226242405493917,0.14279139357691162,0.2782501825771885,0.15704059600181364,0.11020777068515941,0.03477058193882504,0.823998124701742
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Discussion,1.3134726295996377,0.2574691375200737,0.8605627498135642,0.26323883721157904,0.39451682418896894,0.2681574319743203,0.11352299920240994,0.05790567080235448,1.617490849915686,0.3170633058424239,1.059749813002198,0.3241684683311281,0.4858322426238615,0.3302255279484284,0.12591609958101674,0.052942492834726906,0.7017778968110922
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Documentation,1.0969752874875094,0.038359101208018664,0.8763659209811777,0.15298611094534365,0.142510592723288,0.134910027778046,0.12899159768100105,0.03886998718689855,1.197093120431417,0.0418600279202499,0.9563493607863578,0.16694872074437095,0.15551713159225944,0.14722288457400562,0.10845339689458233,0.03794933238605758,0.7922092611611505
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Factual,1.128683894368441,0.3998015920024576,0.8358642125978406,0.22482035658992605,0.21946631279386353,0.37076505873508764,0.10109564214908473,0.04816034350016268,1.1554362253182338,0.4092777833053276,0.8556760626260699,0.23014910152351908,0.2246681549229899,0.3795530193017178,0.08548581592857929,0.0546162887361033,0.6848455115158728
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,General Character,1.6660675877625062,0.7363937981907736,1.1003632129210004,0.12130142077268169,0.20901553599415573,0.4173185190246951,0.09660897562968462,0.10617802861549375,1.2711486921946742,0.5618415605621303,0.839536924742722,0.09254855175691101,0.15947121664143038,0.31839878140794675,0.08635036956466347,0.018717401096906283,0.8640297590251651
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,General Explanation,1.41109238561509,0.4212375603245855,0.9932683168604217,0.42378757625456465,0.28122090297395497,0.5543662056935903,0.12575061707113266,0.09556200401628817,1.2938810468357935,0.38624777589013304,0.9107632233788772,0.38858597664478545,0.25786149797232444,0.5083181894150383,0.11536675545937547,0.10506663336458849,0.5592909435796468
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,GeneralExcerpt Language Translation,1.402660508878962,0.7874081369296879,1.0374573218876584,0.33724679353386033,0.14108548679605512,0.2944031328008591,0.09430198071352602,0.019799525076882374,1.2309039493476617,0.6909895725729547,0.9104200957448942,0.29595073607471556,0.12380949046016612,0.25835330542992574,0.08765845014626966,0.04886956656217073,0.6632008174544161
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Human Decision Making,1.1796314312670164,0.4338705305753685,0.8472122783146949,0.05650610978604631,0.09833230897906872,0.4795481445138642,0.11826788133656913,0.08639174707849978,1.2877879315484901,0.47365068301831914,0.9248903670712754,0.06168696790578565,0.10734807282010972,0.5235163261444482,0.09987713841719909,0.035472148755275795,0.8937734984048297
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Hypothetical Scenarios,1.5747717392991498,0.21631920387122427,1.0680278233405915,0.18638538168670782,0.32397408079094137,0.23609254861158047,0.09648081463125724,0.015096639204699391,1.3974323023865507,0.19195889510356912,0.9477542318913932,0.16539600410759847,0.2874904561309561,0.20950550835349732,0.1063038886306642,0.03538954616676926,0.7953377920582713
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Idea Development,1.3957724968973366,0.4893160580567728,1.0845550934689172,0.2425088241404414,0.3220739395546309,0.4961743803315811,0.12802864499855404,0.08181214276734683,1.2657336573732114,0.4437283333439952,0.9835112012384984,0.21991519506716228,0.2920675298899132,0.4499476916961863,0.12803500559963898,0.07649050408851113,0.7222320339795153
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Interpretative Analysis,1.3657265135981769,0.11567109776040208,1.1715261413058795,0.42408282389458174,0.43893262558772705,0.7944965544323306,0.14450868403013967,0.10092329158661181,1.130376527831659,0.09573797722678568,0.969641900254553,0.3510023897273795,0.3632931866839837,0.6575842583646783,0.1331515264469681,0.12129535087052826,0.5956506124818646
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Literary and Cultural Translation,2.244541835391786,0.08289366143404564,1.3706879934929874,0.15488295332534108,0.03135233039912336,0.0071255296361643974,0.0822609713195348,0.006004102949579648,1.27580818195633,0.0471171487304507,0.7791055303286051,0.08803620230296982,0.017820812700342387,0.004050184704623271,0.0786784052001499,0.008574698259357139,0.8949805684568981
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Logical Deduction,1.2271349621747791,0.5208762164036178,0.9191537367894511,0.23066922883294427,0.24321807824774486,0.2346674426843476,0.11790545089714832,0.018328915509487897,1.186098199059166,0.5034575342178942,0.8884162096827243,0.22295539230032446,0.235084594181087,0.2268199014169081,0.08297816863975016,0.04844207410394641,0.7039699465149075
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,OptionBased,0.9144429699744313,0.5439154288938826,0.6965601082108266,0.2989324625186618,0.13586009839620122,0.008550635563397274,0.13541306100814154,0.06991399084022398,1.3115098100933795,0.7800928480815167,0.9990184683291388,0.42873407092796456,0.1948528860710164,0.0122634683541199,0.10408681969056577,0.05736803793852163,0.6172003084613639
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Paraphrasing,1.9352938491822511,0.0,1.0927725445446976,0.0,0.0570042370893152,0.0,0.06432149080351623,0.0,1.814977116319451,0.0,1.0248351497261576,0.0,0.05346029797700887,0.0,0.10064941414145556,0.0,0.999999999998223
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Personal Opinion and Advice,1.3064658587907427,0.0,1.050329459221137,0.0,0.3296745044998729,0.0,0.12177364021379034,0.0,1.2156082642532804,0.0,0.9772847581334284,0.0,0.30674743583011604,0.0,0.10336928798455769,0.0,0.9999999999981493
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,PostQuality Assessment Rewriting,1.3708331431707612,0.6002071130195814,1.1970098063241061,0.35132159962603526,0.3819283884984118,0.3087729509004573,0.12135232661155704,0.019684606425242612,1.4176341654476217,0.6206985248351626,1.2378764011321488,0.36331591862273627,0.39496764065490353,0.31931463485781997,0.15029153607721568,0.07568501296024488,0.6901433166744277
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Problem Solving,1.7804323384229448,0.39546689480712427,1.1480481809259802,0.17713538858494626,0.2567565845564572,0.17861327621318762,0.1241422451913361,0.019761076655949505,1.0691441600838565,0.23747665775608617,0.6893994125152363,0.10636925771497174,0.15418154176235976,0.10725672470425028,0.06553260243199976,0.02469738714131753,0.8033649120742735
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Professional Content Generation,1.3448249599987612,0.5929479797027387,0.9603696475645613,0.2641410744131951,0.32706181029994597,0.41102430117941646,0.10911009383508324,0.035967587810172424,1.315524175045292,0.5800289443199631,0.9394453002657468,0.2583860200016779,0.3199358511192806,0.4020689835598512,0.11159858593202304,0.051736683598087974,0.7080428946665804
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Quality and Compliance Assessment,1.0367645620619201,0.37432782355316985,0.8414772166144765,0.20843009208785113,0.08123103785227416,0.07030522574348874,0.10235319728207304,0.02061760796493506,1.2767355442824537,0.4609702675347416,1.036246715473096,0.2566736140531116,0.10003288800563555,0.086578172075053,0.11196320669367565,0.03489654921221447,0.7587927953542235
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Quality and Optimization,0.8581512858487326,0.5334646520941748,0.8027997757722499,0.16695841685922053,0.4341822724969508,0.3800282472621014,0.16253832473829333,0.13652663174063318,1.1772633400912746,0.7318387660794361,1.101328822825723,0.2290435576212424,0.5956372503976889,0.5213455145712813,0.17928727023327784,0.09694427443168874,0.7405216677037895
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Specialized Summaries,1.153504489267747,0.3390564518541559,0.8259247585266538,0.1718407242025185,0.21281581846677675,0.12445925097833815,0.09006152912955634,0.005228048055749723,1.9626509127609855,0.5768936843336367,1.4052845015135205,0.29238148385506424,0.36209929328401996,0.2117634259830653,0.14712924848704534,0.04883248179763411,0.7743143589161962
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Specific Character,1.3595510545801675,0.2536688550474526,1.0774592535395702,0.1828885939948861,0.4218313544609325,0.49569934502250346,0.12149036959780946,0.02203007243506111,2.642563386203991,0.49305690015126924,2.094260722271728,0.3554810984236676,0.8199148451953683,0.963492275857395,0.26003191738110343,0.09621889377948362,0.7923719817900312
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Standard Summaries,1.2778449814188158,0.32005503949105085,0.8754240975038051,0.19111099335513626,0.2650697024653157,0.16816249941347983,0.08715411570558396,0.031863945862409426,1.2284225748084234,0.3076764720361247,0.8415658703068487,0.18371951366968453,0.25481776830598524,0.1616585841941196,0.07636739442441498,0.02819252730121674,0.7600538358719012
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Supportive Conversation,1.5376892954842776,0.0,0.9993291406215528,0.0,0.057954307707470454,0.0,0.09199505887283621,0.0,1.0792518973928515,0.0,0.7013951871181549,0.0,0.04067616048252329,0.0,0.06226767538465894,0.0,0.9999999999973811
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Technical and Practical Support,1.5609660256290812,0.542430943553015,1.1639684614927068,0.30528059604680646,0.23751765453881335,0.27362033802871294,0.11599271611146045,0.02416228133391618,1.4185094169979988,0.4929277055796071,1.0577425751791563,0.27742013164228685,0.21584135985970765,0.24864924655838322,0.12595758010624158,0.0576674542170349,0.7164121949393317
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Technical and Scientific Translation,1.5176190536757477,0.3155422040548135,1.0456351866876823,0.16594896682743054,0.4655346028960742,0.25651906690191845,0.09771605308762182,0.0037607356424725347,1.334672873234966,0.27750417279797346,0.919585791699085,0.14594412466652085,0.4094152643462764,0.22559616606835642,0.10120534332761555,0.036896606294216094,0.8016020604315622
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Textual ExpansionReduction,1.5104935240395836,0.5889844038426224,0.9233201923128196,0.36340201144438444,0.10522032096069431,0.10795177398789067,0.0872313432588755,0.058369674896995405,1.0945200267648398,0.4267845013554701,0.669047847951092,0.2633250477160476,0.07624378832562687,0.07822302888035533,0.058414392941055326,0.0772932206044642,0.54514820071845
rb_weqweasdas_RM-Mistral-7B/RM-Mistral-7B,core_eval_runs,RM-Mistral-7B,Tone Adjustment,1.5476650369749076,0.698301904344111,1.2175517845854005,0.12723358913760419,0.06840508450717823,0.03610268348989962,0.1381173228357428,0.06662207844610851,1.9009436802549717,0.8577001872236982,1.4954769378358095,0.15627663700411376,0.08401961017701533,0.04434368314898032,0.16210114764935152,0.015327250721970997,0.9054911782551871
