[
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "stability",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_outcomeRating",
    "avg": 0.8724251641817485
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "stability",
    "metric_logical": "BEST_METRIC",
    "metric": "INFORMRewardModel",
    "avg": 0.9286591527042916
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "stability",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.9133597883597885
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "stability",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.9087301587301587
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "stability",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.8076957125535698
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "sensitivity",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_outcomeRating",
    "avg": 0.36299376570430797
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "sensitivity",
    "metric_logical": "BEST_METRIC",
    "metric": "INFORMRewardModel",
    "avg": 0.2777287869880462
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "sensitivity",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.2705026455026455
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "sensitivity",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.5297619047619048
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "sensitivity",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.2147214172140876
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "binary",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_outcomeRating",
    "avg": 0.8095238095238095
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "binary",
    "metric_logical": "BEST_METRIC",
    "metric": "INFORMRewardModel",
    "avg": 0.9523809523809523
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "binary",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.8412698412698413
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "binary",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.8888888888888888
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "binary",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.7619047619047619
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "aggregated",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_outcomeRating",
    "avg": 0.6177094649430283
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "aggregated",
    "metric_logical": "BEST_METRIC",
    "metric": "INFORMRewardModel",
    "avg": 0.6031939698461689
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "aggregated",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.591931216931217
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "aggregated",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.7192460317460317
  },
  {
    "dataset": "cogymtraveloutcome_outcomerating",
    "type": "aggregated",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.5112085648838287
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "stability",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_grade",
    "avg": 0.6325689621410233
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "stability",
    "metric_logical": "BEST_METRIC",
    "metric": "PRM_min",
    "avg": 0.8525
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "stability",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.8936046511627908
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "stability",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.605
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "stability",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.6950186322337555
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "sensitivity",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_grade",
    "avg": 0.1265818303904043
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "sensitivity",
    "metric_logical": "BEST_METRIC",
    "metric": "PRM_min",
    "avg": 0.0763888888888889
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "sensitivity",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.21175710594315242
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "sensitivity",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.22916666666666666
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "sensitivity",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.11657921527815905
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "binary",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_grade",
    "avg": 0.5333333333333333
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "binary",
    "metric_logical": "BEST_METRIC",
    "metric": "PRM_min",
    "avg": 0.6166666666666667
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "binary",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.8
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "binary",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.43333333333333335
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "binary",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.75
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "aggregated",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_grade",
    "avg": 0.3795753962657138
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "aggregated",
    "metric_logical": "BEST_METRIC",
    "metric": "PRM_min",
    "avg": 0.46444444444444444
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "aggregated",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.5526808785529715
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "aggregated",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.4170833333333333
  },
  {
    "dataset": "evalgenproduct_grade",
    "type": "aggregated",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.4057989237559573
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "stability",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_helpfulness",
    "avg": 0.8823245137414221
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "stability",
    "metric_logical": "BEST_METRIC",
    "metric": "INFORMRewardModel",
    "avg": 0.9080451269363167
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "stability",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.8406174074074074
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "stability",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.8625
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "stability",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.7830467554195392
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "sensitivity",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_helpfulness",
    "avg": 0.3733059100443233
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "sensitivity",
    "metric_logical": "BEST_METRIC",
    "metric": "INFORMRewardModel",
    "avg": 0.27015353078982596
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "sensitivity",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.34773679012345676
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "sensitivity",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.3888888888888889
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "sensitivity",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.12986752389202405
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "binary",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_helpfulness",
    "avg": 0.9777777777777777
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "binary",
    "metric_logical": "BEST_METRIC",
    "metric": "INFORMRewardModel",
    "avg": 1.0
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "binary",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.8555555555555555
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "binary",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.8222222222222222
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "binary",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.6888888888888889
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "aggregated",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_helpfulness",
    "avg": 0.6278152118928727
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "aggregated",
    "metric_logical": "BEST_METRIC",
    "metric": "INFORMRewardModel",
    "avg": 0.5890993288630713
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "aggregated",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.5941770987654321
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "aggregated",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.6256944444444444
  },
  {
    "dataset": "helpsteer2_helpfulness",
    "type": "aggregated",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.4564571396557816
  },
  {
    "dataset": "primock57_time_sec",
    "type": "stability",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_time_sec",
    "avg": 0.5110778988790811
  },
  {
    "dataset": "primock57_time_sec",
    "type": "stability",
    "metric_logical": "BEST_METRIC",
    "metric": "LevenshteinDistance_min",
    "avg": 0.7097655839377695
  },
  {
    "dataset": "primock57_time_sec",
    "type": "stability",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.7936724386724386
  },
  {
    "dataset": "primock57_time_sec",
    "type": "stability",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.8754282407407408
  },
  {
    "dataset": "primock57_time_sec",
    "type": "sensitivity",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_time_sec",
    "avg": -0.1818452083226738
  },
  {
    "dataset": "primock57_time_sec",
    "type": "sensitivity",
    "metric_logical": "BEST_METRIC",
    "metric": "LevenshteinDistance_min",
    "avg": -0.23071586250394197
  },
  {
    "dataset": "primock57_time_sec",
    "type": "sensitivity",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.02896825396825396
  },
  {
    "dataset": "primock57_time_sec",
    "type": "sensitivity",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": -0.07076388888888889
  },
  {
    "dataset": "primock57_time_sec",
    "type": "binary",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_time_sec",
    "avg": 0.03333333333333333
  },
  {
    "dataset": "primock57_time_sec",
    "type": "binary",
    "metric_logical": "BEST_METRIC",
    "metric": "LevenshteinDistance_min",
    "avg": 0.03333333333333333
  },
  {
    "dataset": "primock57_time_sec",
    "type": "binary",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.6111111111111112
  },
  {
    "dataset": "primock57_time_sec",
    "type": "binary",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.4111111111111111
  },
  {
    "dataset": "primock57_time_sec",
    "type": "aggregated",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_time_sec",
    "avg": 0.16461634527820362
  },
  {
    "dataset": "primock57_time_sec",
    "type": "aggregated",
    "metric_logical": "BEST_METRIC",
    "metric": "LevenshteinDistance_min",
    "avg": 0.23952486071691376
  },
  {
    "dataset": "primock57_time_sec",
    "type": "aggregated",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.4113203463203463
  },
  {
    "dataset": "primock57_time_sec",
    "type": "aggregated",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.40233217592592596
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "stability",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_accepted",
    "avg": 0.5917728792777179
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "stability",
    "metric_logical": "BEST_METRIC",
    "metric": "GRMRewardModel",
    "avg": 0.6832591493570722
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "stability",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.5975292592592593
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "stability",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.36666666666666664
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "stability",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.6926322751085101
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "sensitivity",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_accepted",
    "avg": 0.005128555578887534
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "sensitivity",
    "metric_logical": "BEST_METRIC",
    "metric": "GRMRewardModel",
    "avg": -0.023928968842729968
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "sensitivity",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.042492469135802464
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "sensitivity",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.08
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "sensitivity",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.08100791433591296
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "binary",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_accepted",
    "avg": 0.5777777777777777
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "binary",
    "metric_logical": "BEST_METRIC",
    "metric": "GRMRewardModel",
    "avg": 0.4666666666666667
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "binary",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.4888888888888889
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "binary",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.1111111111111111
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "binary",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.5666666666666667
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "aggregated",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_accepted",
    "avg": 0.29845071742830276
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "aggregated",
    "metric_logical": "BEST_METRIC",
    "metric": "GRMRewardModel",
    "avg": 0.3296650902571711
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "aggregated",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.3200108641975309
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "aggregated",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.22333333333333333
  },
  {
    "dataset": "realhumaneval_accepted",
    "type": "aggregated",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.3868200947222115
  },
  {
    "dataset": "simpeval_score",
    "type": "stability",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_score",
    "avg": 0.8601234622544088
  },
  {
    "dataset": "simpeval_score",
    "type": "stability",
    "metric_logical": "BEST_METRIC",
    "metric": "LENS",
    "avg": 0.8689911034969261
  },
  {
    "dataset": "simpeval_score",
    "type": "stability",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.8895833333333333
  },
  {
    "dataset": "simpeval_score",
    "type": "stability",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.8716666666666667
  },
  {
    "dataset": "simpeval_score",
    "type": "stability",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.835467446896094
  },
  {
    "dataset": "simpeval_score",
    "type": "sensitivity",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_score",
    "avg": 0.2846024786005716
  },
  {
    "dataset": "simpeval_score",
    "type": "sensitivity",
    "metric_logical": "BEST_METRIC",
    "metric": "LENS",
    "avg": 0.1456299313249547
  },
  {
    "dataset": "simpeval_score",
    "type": "sensitivity",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.38806216931216925
  },
  {
    "dataset": "simpeval_score",
    "type": "sensitivity",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.324
  },
  {
    "dataset": "simpeval_score",
    "type": "sensitivity",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.021110654356926956
  },
  {
    "dataset": "simpeval_score",
    "type": "binary",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_score",
    "avg": 0.8888888888888888
  },
  {
    "dataset": "simpeval_score",
    "type": "binary",
    "metric_logical": "BEST_METRIC",
    "metric": "LENS",
    "avg": 0.6777777777777778
  },
  {
    "dataset": "simpeval_score",
    "type": "binary",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.9
  },
  {
    "dataset": "simpeval_score",
    "type": "binary",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.8444444444444444
  },
  {
    "dataset": "simpeval_score",
    "type": "binary",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.5333333333333333
  },
  {
    "dataset": "simpeval_score",
    "type": "aggregated",
    "metric_logical": "Autometrics",
    "metric": "Autometrics_Regression_score",
    "avg": 0.5723629704274902
  },
  {
    "dataset": "simpeval_score",
    "type": "aggregated",
    "metric_logical": "BEST_METRIC",
    "metric": "LENS",
    "avg": 0.5073105174109404
  },
  {
    "dataset": "simpeval_score",
    "type": "aggregated",
    "metric_logical": "DNAEval",
    "metric": "DNAEval-qwen3_32b-seed42",
    "avg": 0.6388227513227512
  },
  {
    "dataset": "simpeval_score",
    "type": "aggregated",
    "metric_logical": "LLMJudge",
    "metric": "LLMJudge-qwen3_32b-seed42",
    "avg": 0.5978333333333333
  },
  {
    "dataset": "simpeval_score",
    "type": "aggregated",
    "metric_logical": "MetaMetrics",
    "metric": "metametrics_score",
    "avg": 0.42828905062651046
  }
]