{
  "armor_rm_score": {
    "accuracy": 0.55787,
    "precision": 0.5172812061449809,
    "recall": 0.8862881947399779,
    "f1": 0.6532776022020594,
    "true_positives": 41652,
    "true_negatives": 14135,
    "false_positives": 38869,
    "false_negatives": 5344,
    "total": 100000,
    "normalized_precision": 0.4861989218156106,
    "normalized_recall": 0.8647895853491636,
    "selection_accuracy": 0.5,
    "solvable_selection_accuracy": 0.5050505050505051
  },
  "eurus_prm2_scores": {
    "accuracy": 0.47003,
    "precision": 0.4699928995029652,
    "recall": 1.0,
    "f1": 0.6394492104851384,
    "true_positives": 46996,
    "true_negatives": 7,
    "false_positives": 52997,
    "false_negatives": 0,
    "total": 100000,
    "normalized_precision": 0.4747090644613302,
    "normalized_recall": 1.0,
    "selection_accuracy": 0.68,
    "solvable_selection_accuracy": 0.6868686868686869
  },
  "eurus_prm_scores": {
    "accuracy": 0.47109,
    "precision": 0.4704904738544097,
    "recall": 0.9999574431866541,
    "f1": 0.6399008707847956,
    "true_positives": 46994,
    "true_negatives": 115,
    "false_positives": 52889,
    "false_negatives": 2,
    "total": 100000,
    "normalized_precision": 0.47483429437705704,
    "normalized_recall": 0.9998784229150722,
    "selection_accuracy": 0.59,
    "solvable_selection_accuracy": 0.5959595959595959
  },
  "gpm_scores": {
    "accuracy": 0.50264,
    "precision": 0.48438639679066375,
    "recall": 0.9043748404119499,
    "f1": 0.6308742763841472,
    "true_positives": 42502,
    "true_negatives": 7762,
    "false_positives": 45242,
    "false_negatives": 4494,
    "total": 100000,
    "normalized_precision": 0.4785555373877076,
    "normalized_recall": 0.8879423723706124,
    "selection_accuracy": 0.4,
    "solvable_selection_accuracy": 0.40404040404040403
  },
  "grm_gemma_scores": {
    "accuracy": 0.55074,
    "precision": 0.5116986165114387,
    "recall": 0.9632947484892331,
    "f1": 0.6683644846015295,
    "true_positives": 45271,
    "true_negatives": 9803,
    "false_positives": 43201,
    "false_negatives": 1725,
    "total": 100000,
    "normalized_precision": 0.48504706379392176,
    "normalized_recall": 0.9321968470496226,
    "selection_accuracy": 0.49,
    "solvable_selection_accuracy": 0.494949494949495
  },
  "grm_llama32_scores": {
    "accuracy": 0.57126,
    "precision": 0.5255993044342318,
    "recall": 0.900417056770789,
    "f1": 0.6637491569024202,
    "true_positives": 42316,
    "true_negatives": 14810,
    "false_positives": 38194,
    "false_negatives": 4680,
    "total": 100000,
    "normalized_precision": 0.48086635645529263,
    "normalized_recall": 0.8574879830009852,
    "selection_accuracy": 0.62,
    "solvable_selection_accuracy": 0.6262626262626263
  },
  "grm_scores": {
    "accuracy": 0.61771,
    "precision": 0.5580050548490824,
    "recall": 0.8972891309898715,
    "f1": 0.6880971223902028,
    "true_positives": 42169,
    "true_negatives": 19602,
    "false_positives": 33402,
    "false_negatives": 4827,
    "total": 100000,
    "normalized_precision": 0.49124104507607025,
    "normalized_recall": 0.8327503405049456,
    "selection_accuracy": 0.51,
    "solvable_selection_accuracy": 0.5151515151515151
  },
  "inform_scores": {
    "accuracy": 0.6626,
    "precision": 0.6999002991026919,
    "recall": 0.9047687240441071,
    "f1": 0.7892567145534041,
    "true_positives": 6318,
    "true_negatives": 308,
    "false_positives": 2709,
    "false_negatives": 665,
    "total": 10000,
    "normalized_precision": 0.701116175178249,
    "normalized_recall": 0.90639850811324,
    "selection_accuracy": 0.5,
    "solvable_selection_accuracy": 0.5
  },
  "internlm2_scores": {
    "accuracy": 0.70351,
    "precision": 0.654094195817862,
    "recall": 0.7834070984764661,
    "f1": 0.712934364803501,
    "true_positives": 36817,
    "true_negatives": 33534,
    "false_positives": 19470,
    "false_negatives": 10179,
    "total": 100000,
    "normalized_precision": 0.5179745955912058,
    "normalized_recall": 0.6288392246751151,
    "selection_accuracy": 0.59,
    "solvable_selection_accuracy": 0.5959595959595959
  },
  "internlm_scores": {
    "accuracy": 0.67304,
    "precision": 0.6024707635863334,
    "recall": 0.8945016597157205,
    "f1": 0.7200013701914844,
    "true_positives": 42038,
    "true_negatives": 25266,
    "false_positives": 27738,
    "false_negatives": 4958,
    "total": 100000,
    "normalized_precision": 0.5160638405247149,
    "normalized_recall": 0.8122442824247675,
    "selection_accuracy": 0.57,
    "solvable_selection_accuracy": 0.5757575757575758
  },
  "offset_bias_scores": {
    "accuracy": 0.51817,
    "precision": 0.4937433453863102,
    "recall": 0.9965954549323347,
    "f1": 0.660336258855874,
    "true_positives": 46836,
    "true_negatives": 4981,
    "false_positives": 48023,
    "false_negatives": 160,
    "total": 100000,
    "normalized_precision": 0.4818223192790992,
    "normalized_recall": 0.9956264708702314,
    "selection_accuracy": 0.58,
    "solvable_selection_accuracy": 0.5858585858585859
  },
  "qrm_gemma_correctness": {
    "accuracy": 0.48685,
    "precision": 0.4774108515779454,
    "recall": 0.9711464805515363,
    "f1": 0.640134647077387,
    "true_positives": 45640,
    "true_negatives": 3045,
    "false_positives": 49959,
    "false_negatives": 1356,
    "total": 100000,
    "normalized_precision": 0.4773566885107549,
    "normalized_recall": 0.9784057979548942,
    "selection_accuracy": 0.54,
    "solvable_selection_accuracy": 0.5454545454545454
  },
  "qrm_scores": {
    "accuracy": 0.58091,
    "precision": 0.5293425469815304,
    "recall": 0.9763596901863988,
    "f1": 0.6864952610357646,
    "true_positives": 45885,
    "true_negatives": 12206,
    "false_positives": 40798,
    "false_negatives": 1111,
    "total": 100000,
    "normalized_precision": 0.4880503805681773,
    "normalized_recall": 0.9266746951172807,
    "selection_accuracy": 0.46,
    "solvable_selection_accuracy": 0.46464646464646464
  },
  "qwen25_math_scores": {
    "accuracy": 0.68882,
    "precision": 0.6075541225241824,
    "recall": 0.954251425653247,
    "f1": 0.742421984935022,
    "true_positives": 44846,
    "true_negatives": 24036,
    "false_positives": 28968,
    "false_negatives": 2150,
    "total": 100000,
    "normalized_precision": 0.5481625963615212,
    "normalized_recall": 0.861587869985998,
    "selection_accuracy": 0.74,
    "solvable_selection_accuracy": 0.7474747474747475
  },
  "skyworks_scores": {
    "accuracy": 0.57583,
    "precision": 0.527410954803951,
    "recall": 0.9373563707549579,
    "f1": 0.675017813225458,
    "true_positives": 44052,
    "true_negatives": 13531,
    "false_positives": 39473,
    "false_negatives": 2944,
    "total": 100000,
    "normalized_precision": 0.49129559478464785,
    "normalized_recall": 0.9062251154184999,
    "selection_accuracy": 0.6,
    "solvable_selection_accuracy": 0.6060606060606061
  },
  "urm_scores": {
    "accuracy": 0.61281,
    "precision": 0.5515065526640033,
    "recall": 0.942910034896587,
    "f1": 0.6959519415760336,
    "true_positives": 44313,
    "true_negatives": 16968,
    "false_positives": 36036,
    "false_negatives": 2683,
    "total": 100000,
    "normalized_precision": 0.4935839482202463,
    "normalized_recall": 0.8499087466151912,
    "selection_accuracy": 0.54,
    "solvable_selection_accuracy": 0.5454545454545454
  },
  "armor_rm_correctness": {
    "accuracy": 0.56303,
    "precision": 0.5190396491025567,
    "recall": 0.956826112860669,
    "f1": 0.6730025218699255,
    "true_positives": 44967,
    "true_negatives": 11336,
    "false_positives": 41668,
    "false_negatives": 2029,
    "total": 100000,
    "normalized_precision": 0.48997054484295866,
    "normalized_recall": 0.91737555825745,
    "selection_accuracy": 0.48,
    "solvable_selection_accuracy": 0.48484848484848486
  }
}