dataset,measure,metric,metric_class,kendall_correlation,p_value
CoGymLessonOutcome,outcomeRating,FastTextToxicity,FastTextToxicity,0.6998542122237652,0.0415400670098851
CoGymLessonProcess,agentRating,PRM_mean,MathProcessRewardModel,0.5087055749715826,0.0620279043610827
CoGymLessonProcess,communicationRating,FKGL,FKGL,0.7416198487095662,0.0347402884985901
CoGymTabularOutcome,outcomeRating,LENS_SALSA,LENS_SALSA,-0.629940788348712,0.0179911062383214
CoGymTabularProcess,agentRating,Toxicity,Toxicity,-0.3661961453350343,0.0734579322240253
CoGymTabularProcess,communicationRating,Sentiment,Sentiment,-0.7597217761636014,0.0037413939048458
CoGymTravelOutcome,outcomeRating,INFORMRewardModel,INFORMRewardModel,0.4737766343280913,0.0013682375505865
CoGymTravelProcess,agentRating,GRMRewardModel,GRMRewardModel,0.3025007742525794,0.0043358155848771
CoGymTravelProcess,communicationRating,LDLReward27B,LDLRewardModel,-0.3113114554058924,0.0343845357273399
EvalGenMedical,grade,INFORMRewardModel,INFORMRewardModel,0.298820816149246,0.004189466058075
EvalGenProduct,grade,PRM_min,MathProcessRewardModel,0.3836748095807357,4.089119896782945e-05
HelpSteer,coherence,INFORMRewardModel,INFORMRewardModel,0.2713085461481928,0.0
HelpSteer,complexity,distinct_1,DistinctNGram,-0.2773431942266416,0.0
HelpSteer,correctness,INFORMRewardModel,INFORMRewardModel,0.4195007654099293,0.0
HelpSteer,helpfulness,INFORMRewardModel,INFORMRewardModel,0.4150247755783487,0.0
HelpSteer,verbosity,distinct_1,DistinctNGram,-0.4076081893510244,0.0
HelpSteer2,coherence,INFORMRewardModel,INFORMRewardModel,0.2243176061982336,0.0
HelpSteer2,complexity,distinct_1,DistinctNGram,-0.2633683075450573,0.0
HelpSteer2,correctness,INFORMRewardModel,INFORMRewardModel,0.3982823283775037,0.0
HelpSteer2,helpfulness,INFORMRewardModel,INFORMRewardModel,0.40369087779015,0.0
HelpSteer2,verbosity,distinct_1,DistinctNGram,-0.3872695748554819,0.0
Primock57,inc_plus_omi,LevenshteinDistance_min,LevenshteinDistance,0.5854158563515489,7.459552340431464e-155
Primock57,incorrect,ROUGE-2-f1,ROUGE,-0.5888500353304446,1.7162025592885737e-146
Primock57,omissions,BERTScoreR_roberta-large,BERTScore,-0.5402497567247847,5.582054242231644e-131
Primock57,time_sec,LevenshteinDistance_min,LevenshteinDistance,0.5038793653946702,3.966368138184756e-121
RealHumanEval,accepted,GRMRewardModel,GRMRewardModel,0.1814313004467367,6.000164082600277e-41
SimpDA,fluency,LENS,LENS,0.6067546756144107,1.0210287403519992e-55
SimpDA,meaning,ParaScore,ParaScore,0.5682948296205032,4.2509676697589866e-49
SimpDA,simplicity,LENS,LENS,0.5588338835236037,1.552404885869797e-47
SimpEval,score,LENS,LENS,0.2924719045124436,6.436926925308576e-12
SummEval,coherence,UniEvalSum-relevance,UniEvalSum,0.2641251083915238,6.361479465708496e-38
SummEval,consistency,PRM_mean,MathProcessRewardModel,0.1747311974561024,8.711073341257703e-17
SummEval,fluency,LENS,LENS,0.1647352420216822,3.7201003866144304e-15
SummEval,relevance,PRM_max,MathProcessRewardModel,0.2217658748291919,5.458488565423915e-24
