{
  "armor_rm_score": {
    "accuracy": 0.298,
    "precision": 0.0991274495780289,
    "recall": 0.9719495091164095,
    "f1": 0.1799065420560748,
    "true_positives": 693,
    "true_negatives": 1989,
    "false_positives": 6298,
    "false_negatives": 20,
    "total": 9000,
    "normalized_precision": 0.27389989398536024,
    "normalized_recall": 0.9157282681476231,
    "selection_accuracy": 0.12222222222222222,
    "solvable_selection_accuracy": 0.39285714285714285
  },
  "eurus_prm2_scores": {
    "accuracy": 0.0791111111111111,
    "precision": 0.07911990221135681,
    "recall": 0.9985974754558204,
    "f1": 0.14662273476112023,
    "true_positives": 712,
    "true_negatives": 0,
    "false_positives": 8287,
    "false_negatives": 1,
    "total": 9000,
    "normalized_precision": 0.25439393939393934,
    "normalized_recall": 0.9988479262672811,
    "selection_accuracy": 0.15555555555555556,
    "solvable_selection_accuracy": 0.5
  },
  "eurus_prm_scores": {
    "accuracy": 0.0791111111111111,
    "precision": 0.07911990221135681,
    "recall": 0.9985974754558204,
    "f1": 0.14662273476112023,
    "true_positives": 712,
    "true_negatives": 0,
    "false_positives": 8287,
    "false_negatives": 1,
    "total": 9000,
    "normalized_precision": 0.25439393939393934,
    "normalized_recall": 0.9988479262672811,
    "selection_accuracy": 0.08888888888888889,
    "solvable_selection_accuracy": 0.2857142857142857
  },
  "gpm_scores": {
    "accuracy": 0.4638888888888889,
    "precision": 0.10170476559473073,
    "recall": 0.7363253856942497,
    "f1": 0.17872340425531913,
    "true_positives": 525,
    "true_negatives": 3650,
    "false_positives": 4637,
    "false_negatives": 188,
    "total": 9000,
    "normalized_precision": 0.29079942982009327,
    "normalized_recall": 0.6640813102073897,
    "selection_accuracy": 0.06666666666666667,
    "solvable_selection_accuracy": 0.21428571428571427
  },
  "grm_gemma_scores": {
    "accuracy": 0.3688888888888889,
    "precision": 0.09065435965056864,
    "recall": 0.7713884992987378,
    "f1": 0.1622418879056047,
    "true_positives": 550,
    "true_negatives": 2770,
    "false_positives": 5517,
    "false_negatives": 163,
    "total": 9000,
    "normalized_precision": 0.28849364275230893,
    "normalized_recall": 0.6532072840199178,
    "selection_accuracy": 0.07777777777777778,
    "solvable_selection_accuracy": 0.25
  },
  "grm_llama32_scores": {
    "accuracy": 0.3838888888888889,
    "precision": 0.10625814863102999,
    "recall": 0.9144460028050491,
    "f1": 0.1903927580668711,
    "true_positives": 652,
    "true_negatives": 2803,
    "false_positives": 5484,
    "false_negatives": 61,
    "total": 9000,
    "normalized_precision": 0.2820121906345588,
    "normalized_recall": 0.8452512622135048,
    "selection_accuracy": 0.1,
    "solvable_selection_accuracy": 0.32142857142857145
  },
  "grm_scores": {
    "accuracy": 0.42533333333333334,
    "precision": 0.1054680587506636,
    "recall": 0.8359046283309958,
    "f1": 0.18730358265241986,
    "true_positives": 596,
    "true_negatives": 3232,
    "false_positives": 5055,
    "false_negatives": 117,
    "total": 9000,
    "normalized_precision": 0.2781903884030966,
    "normalized_recall": 0.7531982548344105,
    "selection_accuracy": 0.1,
    "solvable_selection_accuracy": 0.32142857142857145
  },
  "internlm2_scores": {
    "accuracy": 0.41955555555555557,
    "precision": 0.1142466221994185,
    "recall": 0.9368863955119214,
    "f1": 0.20365853658536584,
    "true_positives": 668,
    "true_negatives": 3108,
    "false_positives": 5179,
    "false_negatives": 45,
    "total": 9000,
    "normalized_precision": 0.29016454519811125,
    "normalized_recall": 0.8870378169274604,
    "selection_accuracy": 0.17777777777777778,
    "solvable_selection_accuracy": 0.5714285714285714
  },
  "internlm_scores": {
    "accuracy": 0.44522222222222224,
    "precision": 0.11689939133548156,
    "recall": 0.9158485273492286,
    "f1": 0.20733449753929195,
    "true_positives": 653,
    "true_negatives": 3354,
    "false_positives": 4933,
    "false_negatives": 60,
    "total": 9000,
    "normalized_precision": 0.2931369135016669,
    "normalized_recall": 0.9102802182418518,
    "selection_accuracy": 0.13333333333333333,
    "solvable_selection_accuracy": 0.42857142857142855
  },
  "offset_bias_scores": {
    "accuracy": 0.35888888888888887,
    "precision": 0.10299890092636206,
    "recall": 0.9200561009817672,
    "f1": 0.18525840158147414,
    "true_positives": 656,
    "true_negatives": 2574,
    "false_positives": 5713,
    "false_negatives": 57,
    "total": 9000,
    "normalized_precision": 0.27895125380511676,
    "normalized_recall": 0.8588265608865188,
    "selection_accuracy": 0.08888888888888889,
    "solvable_selection_accuracy": 0.2857142857142857
  },
  "qrm_gemma_scores": {
    "accuracy": 0.08522222222222223,
    "precision": 0.07904164800716525,
    "recall": 0.9901823281907434,
    "f1": 0.14639709694142045,
    "true_positives": 706,
    "true_negatives": 61,
    "false_positives": 8226,
    "false_negatives": 7,
    "total": 9000,
    "normalized_precision": 0.25488306575827074,
    "normalized_recall": 0.9927838057220946,
    "selection_accuracy": 0.07777777777777778,
    "solvable_selection_accuracy": 0.25
  },
  "qrm_scores": {
    "accuracy": 0.45422222222222225,
    "precision": 0.11540575196922513,
    "recall": 0.8835904628330996,
    "f1": 0.20414776409591706,
    "true_positives": 630,
    "true_negatives": 3458,
    "false_positives": 4829,
    "false_negatives": 83,
    "total": 9000,
    "normalized_precision": 0.2973418367212079,
    "normalized_recall": 0.7881614903903048,
    "selection_accuracy": 0.12222222222222222,
    "solvable_selection_accuracy": 0.39285714285714285
  },
  "qwen25_math_scores": {
    "accuracy": 0.7196666666666667,
    "precision": 0.21030729833546735,
    "recall": 0.9214586255259467,
    "f1": 0.3424550430023455,
    "true_positives": 657,
    "true_negatives": 5820,
    "false_positives": 2467,
    "false_negatives": 56,
    "total": 9000,
    "normalized_precision": 0.4504719638987725,
    "normalized_recall": 0.7782244587613858,
    "selection_accuracy": 0.2111111111111111,
    "solvable_selection_accuracy": 0.6785714285714286
  },
  "skywork_gemma_scores": {
    "accuracy": 0.4008888888888889,
    "precision": 0.10568009438732513,
    "recall": 0.879382889200561,
    "f1": 0.18868492326211256,
    "true_positives": 627,
    "true_negatives": 2981,
    "false_positives": 5306,
    "false_negatives": 86,
    "total": 9000,
    "normalized_precision": 0.30562239662142104,
    "normalized_recall": 0.7879874182885921,
    "selection_accuracy": 0.14444444444444443,
    "solvable_selection_accuracy": 0.4642857142857143
  },
  "skyworks_scores": {
    "accuracy": 0.663,
    "precision": 0.17082860385925086,
    "recall": 0.844319775596073,
    "f1": 0.28416332310597125,
    "true_positives": 602,
    "true_negatives": 5365,
    "false_positives": 2922,
    "false_negatives": 111,
    "total": 9000,
    "normalized_precision": 0.31991233092035054,
    "normalized_recall": 0.7075036325319598,
    "selection_accuracy": 0.12222222222222222,
    "solvable_selection_accuracy": 0.39285714285714285
  },
  "urm_scores": {
    "accuracy": 0.07922222222222222,
    "precision": 0.07912869526561458,
    "recall": 0.9985974754558204,
    "f1": 0.14663783338482134,
    "true_positives": 712,
    "true_negatives": 1,
    "false_positives": 8286,
    "false_negatives": 1,
    "total": 9000,
    "normalized_precision": 0.2545043731778425,
    "normalized_recall": 0.9988479262672811,
    "selection_accuracy": 0.14444444444444443,
    "solvable_selection_accuracy": 0.4642857142857143
  }
}