Dataset: cogymtraveloutcome_outcomerating
  sensitivity:
    Autometrics (Autometrics_Regression_outcomeRating): avg=0.362994
    BEST_METRIC (INFORMRewardModel): avg=0.277729
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.270503
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.529762
    MetaMetrics (metametrics_score): avg=0.214721
  stability:
    Autometrics (Autometrics_Regression_outcomeRating): avg=0.872425
    BEST_METRIC (INFORMRewardModel): avg=0.928659
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.913360
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.908730
    MetaMetrics (metametrics_score): avg=0.807696
  binary:
    Autometrics (Autometrics_Regression_outcomeRating): avg=0.809524
    BEST_METRIC (INFORMRewardModel): avg=0.952381
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.841270
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.888889
    MetaMetrics (metametrics_score): avg=0.761905
  aggregated:
    Autometrics (Autometrics_Regression_outcomeRating): avg=0.617709
    BEST_METRIC (INFORMRewardModel): avg=0.603194
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.591931
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.719246
    MetaMetrics (metametrics_score): avg=0.511209
Dataset: evalgenproduct_grade
  sensitivity:
    Autometrics (Autometrics_Regression_grade): avg=0.126582
    BEST_METRIC (PRM_min): avg=0.076389
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.211757
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.229167
    MetaMetrics (metametrics_score): avg=0.116579
  stability:
    Autometrics (Autometrics_Regression_grade): avg=0.632569
    BEST_METRIC (PRM_min): avg=0.852500
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.893605
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.605000
    MetaMetrics (metametrics_score): avg=0.695019
  binary:
    Autometrics (Autometrics_Regression_grade): avg=0.533333
    BEST_METRIC (PRM_min): avg=0.616667
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.800000
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.433333
    MetaMetrics (metametrics_score): avg=0.750000
  aggregated:
    Autometrics (Autometrics_Regression_grade): avg=0.379575
    BEST_METRIC (PRM_min): avg=0.464444
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.552681
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.417083
    MetaMetrics (metametrics_score): avg=0.405799
Dataset: helpsteer2_helpfulness
  sensitivity:
    Autometrics (Autometrics_Regression_helpfulness): avg=0.373306
    BEST_METRIC (INFORMRewardModel): avg=0.270154
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.347737
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.388889
    MetaMetrics (metametrics_score): avg=0.129868
  stability:
    Autometrics (Autometrics_Regression_helpfulness): avg=0.882325
    BEST_METRIC (INFORMRewardModel): avg=0.908045
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.840617
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.862500
    MetaMetrics (metametrics_score): avg=0.783047
  binary:
    Autometrics (Autometrics_Regression_helpfulness): avg=0.977778
    BEST_METRIC (INFORMRewardModel): avg=1.000000
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.855556
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.822222
    MetaMetrics (metametrics_score): avg=0.688889
  aggregated:
    Autometrics (Autometrics_Regression_helpfulness): avg=0.627815
    BEST_METRIC (INFORMRewardModel): avg=0.589099
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.594177
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.625694
    MetaMetrics (metametrics_score): avg=0.456457
Dataset: primock57_time_sec
  sensitivity:
    Autometrics (Autometrics_Regression_time_sec): avg=-0.181845
    BEST_METRIC (LevenshteinDistance_min): avg=-0.230716
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.028968
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=-0.070764
  stability:
    Autometrics (Autometrics_Regression_time_sec): avg=0.511078
    BEST_METRIC (LevenshteinDistance_min): avg=0.709766
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.793672
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.875428
  binary:
    Autometrics (Autometrics_Regression_time_sec): avg=0.033333
    BEST_METRIC (LevenshteinDistance_min): avg=0.033333
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.611111
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.411111
  aggregated:
    Autometrics (Autometrics_Regression_time_sec): avg=0.164616
    BEST_METRIC (LevenshteinDistance_min): avg=0.239525
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.411320
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.402332
Dataset: realhumaneval_accepted
  sensitivity:
    Autometrics (Autometrics_Regression_accepted): avg=0.005129
    BEST_METRIC (GRMRewardModel): avg=-0.023929
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.042492
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.080000
    MetaMetrics (metametrics_score): avg=0.081008
  stability:
    Autometrics (Autometrics_Regression_accepted): avg=0.591773
    BEST_METRIC (GRMRewardModel): avg=0.683259
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.597529
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.366667
    MetaMetrics (metametrics_score): avg=0.692632
  binary:
    Autometrics (Autometrics_Regression_accepted): avg=0.577778
    BEST_METRIC (GRMRewardModel): avg=0.466667
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.488889
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.111111
    MetaMetrics (metametrics_score): avg=0.566667
  aggregated:
    Autometrics (Autometrics_Regression_accepted): avg=0.298451
    BEST_METRIC (GRMRewardModel): avg=0.329665
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.320011
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.223333
    MetaMetrics (metametrics_score): avg=0.386820
Dataset: simpeval_score
  sensitivity:
    Autometrics (Autometrics_Regression_score): avg=0.284602
    BEST_METRIC (LENS): avg=0.145630
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.388062
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.324000
    MetaMetrics (metametrics_score): avg=0.021111
  stability:
    Autometrics (Autometrics_Regression_score): avg=0.860123
    BEST_METRIC (LENS): avg=0.868991
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.889583
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.871667
    MetaMetrics (metametrics_score): avg=0.835467
  binary:
    Autometrics (Autometrics_Regression_score): avg=0.888889
    BEST_METRIC (LENS): avg=0.677778
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.900000
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.844444
    MetaMetrics (metametrics_score): avg=0.533333
  aggregated:
    Autometrics (Autometrics_Regression_score): avg=0.572363
    BEST_METRIC (LENS): avg=0.507311
    DNAEval (DNAEval-qwen3_32b-seed42): avg=0.638823
    LLMJudge (LLMJudge-qwen3_32b-seed42): avg=0.597833
    MetaMetrics (metametrics_score): avg=0.428289
