{
  "ArmorRM_scores": {
    "accuracy": 0.45944,
    "precision": 0.4572496815810017,
    "recall": 0.44610574417765664,
    "f1": 0.45160897617984824,
    "true_positives": 11129,
    "true_negatives": 11843,
    "false_positives": 13210,
    "false_negatives": 13818,
    "total": 50000,
    "normalized_precision": 0.504863295440632,
    "normalized_recall": 0.4848151391719254,
    "selection_accuracy": 0.526,
    "solvable_selection_accuracy": 0.530241935483871
  },
  "EurusPRMStage1_avg_scores": {
    "accuracy": 0.50138,
    "precision": 0.500160481444333,
    "recall": 0.9994388102777889,
    "f1": 0.6666844926935572,
    "true_positives": 24933,
    "true_negatives": 136,
    "false_positives": 24917,
    "false_negatives": 14,
    "total": 50000,
    "normalized_precision": 0.5041098129642676,
    "normalized_recall": 0.9996465490294951,
    "selection_accuracy": 0.682,
    "solvable_selection_accuracy": 0.6875
  },
  "EurusPRMStage1_max_scores": {
    "accuracy": 0.50868,
    "precision": 0.9399538106235565,
    "recall": 0.01631458692427947,
    "f1": 0.032072498029944836,
    "true_positives": 407,
    "true_negatives": 25027,
    "false_positives": 26,
    "false_negatives": 24540,
    "total": 50000,
    "normalized_precision": 0.2758736559139785,
    "normalized_recall": 0.010047851756311059,
    "selection_accuracy": 0.666,
    "solvable_selection_accuracy": 0.6713709677419355
  },
  "EurusPRMStage1_min_scores": {
    "accuracy": 0.49868,
    "precision": 0.4988091187478734,
    "recall": 0.9989978754960517,
    "f1": 0.6653851288212522,
    "true_positives": 24922,
    "true_negatives": 12,
    "false_positives": 25041,
    "false_negatives": 25,
    "total": 50000,
    "normalized_precision": 0.5029859723677744,
    "normalized_recall": 0.9993615959593246,
    "selection_accuracy": 0.694,
    "solvable_selection_accuracy": 0.6995967741935484
  },
  "EurusPRMStage2_avg_scores": {
    "accuracy": 0.5017,
    "precision": 0.500321220638426,
    "recall": 0.9989577905158937,
    "f1": 0.6667201733616919,
    "true_positives": 24921,
    "true_negatives": 164,
    "false_positives": 24889,
    "false_negatives": 26,
    "total": 50000,
    "normalized_precision": 0.5042980659424893,
    "normalized_recall": 0.9991065634774705,
    "selection_accuracy": 0.7,
    "solvable_selection_accuracy": 0.7056451612903226
  },
  "EurusPRMStage2_max_scores": {
    "accuracy": 0.50256,
    "precision": 0.9629629629629629,
    "recall": 0.003126628452318916,
    "f1": 0.006233019018699057,
    "true_positives": 78,
    "true_negatives": 25050,
    "false_positives": 3,
    "false_negatives": 24869,
    "total": 50000,
    "normalized_precision": 0.1028225806451613,
    "normalized_recall": 0.0018915940245605816,
    "selection_accuracy": 0.686,
    "solvable_selection_accuracy": 0.6915322580645161
  },
  "EurusPRMStage2_min_scores": {
    "accuracy": 0.49848,
    "precision": 0.498708476001682,
    "recall": 0.9983565158135247,
    "f1": 0.6651532955880781,
    "true_positives": 24906,
    "true_negatives": 18,
    "false_positives": 25035,
    "false_negatives": 41,
    "total": 50000,
    "normalized_precision": 0.5029384138702946,
    "normalized_recall": 0.9986533480033032,
    "selection_accuracy": 0.708,
    "solvable_selection_accuracy": 0.7137096774193549
  },
  "GPM_scores": {
    "accuracy": 0.54168,
    "precision": 0.528648404660479,
    "recall": 0.7511524431795407,
    "f1": 0.6205583336092989,
    "true_positives": 18739,
    "true_negatives": 8345,
    "false_positives": 16708,
    "false_negatives": 6208,
    "total": 50000,
    "normalized_precision": 0.5045196501315024,
    "normalized_recall": 0.7149602531802347,
    "selection_accuracy": 0.504,
    "solvable_selection_accuracy": 0.5080645161290323
  },
  "GRMGemma_scores": {
    "accuracy": 0.50896,
    "precision": 0.5287189181329068,
    "recall": 0.145748987854251,
    "f1": 0.2285067873303167,
    "true_positives": 3636,
    "true_negatives": 21812,
    "false_positives": 3241,
    "false_negatives": 21311,
    "total": 50000,
    "normalized_precision": 0.5233176990158368,
    "normalized_recall": 0.1414065668536807,
    "selection_accuracy": 0.528,
    "solvable_selection_accuracy": 0.532258064516129
  },
  "GRMLlama32_scores": {
    "accuracy": 0.50102,
    "precision": 0.49984885126964934,
    "recall": 0.13256102938229045,
    "f1": 0.20954915565694007,
    "true_positives": 3307,
    "true_negatives": 21744,
    "false_positives": 3309,
    "false_negatives": 21640,
    "total": 50000,
    "normalized_precision": 0.5005874039393917,
    "normalized_recall": 0.13312239329366785,
    "selection_accuracy": 0.538,
    "solvable_selection_accuracy": 0.5423387096774194
  },
  "GRM_scores": {
    "accuracy": 0.50926,
    "precision": 0.5573266219239373,
    "recall": 0.0798893654547641,
    "f1": 0.1397468709462539,
    "true_positives": 1993,
    "true_negatives": 23470,
    "false_positives": 1583,
    "false_negatives": 22954,
    "total": 50000,
    "normalized_precision": 0.4657445471806081,
    "normalized_recall": 0.0713798807965893,
    "selection_accuracy": 0.528,
    "solvable_selection_accuracy": 0.532258064516129
  },
  "InternLM2Reward7B_scores": {
    "accuracy": 0.53936,
    "precision": 0.7374163153979668,
    "recall": 0.11921273098969816,
    "f1": 0.20524499654934436,
    "true_positives": 2974,
    "true_negatives": 23994,
    "false_positives": 1059,
    "false_negatives": 21973,
    "total": 50000,
    "normalized_precision": 0.5639232812261779,
    "normalized_recall": 0.09574857821835575,
    "selection_accuracy": 0.594,
    "solvable_selection_accuracy": 0.5987903225806451
  },
  "OffsetBias_scores": {
    "accuracy": 0.45984,
    "precision": 0.44077245818725214,
    "recall": 0.30741171283120217,
    "f1": 0.36220658385679877,
    "true_positives": 7669,
    "true_negatives": 15323,
    "false_positives": 9730,
    "false_negatives": 17278,
    "total": 50000,
    "normalized_precision": 0.4946872823603289,
    "normalized_recall": 0.34448427269597093,
    "selection_accuracy": 0.542,
    "solvable_selection_accuracy": 0.5463709677419355
  },
  "QRM_scores": {
    "accuracy": 0.49872,
    "precision": 0.4978874002383446,
    "recall": 0.5526516214374474,
    "f1": 0.5238420912648657,
    "true_positives": 13787,
    "true_negatives": 11149,
    "false_positives": 13904,
    "false_negatives": 11160,
    "total": 50000,
    "normalized_precision": 0.5222784123985008,
    "normalized_recall": 0.563551628599703,
    "selection_accuracy": 0.53,
    "solvable_selection_accuracy": 0.5342741935483871
  },
  "QwenPRM_avg_scores": {
    "accuracy": 0.51918,
    "precision": 0.5093317402768622,
    "recall": 0.9911011344049385,
    "f1": 0.6728715080758187,
    "true_positives": 24725,
    "true_negatives": 1234,
    "false_positives": 23819,
    "false_negatives": 222,
    "total": 50000,
    "normalized_precision": 0.5082675027622067,
    "normalized_recall": 0.9848498353400036,
    "selection_accuracy": 0.732,
    "solvable_selection_accuracy": 0.7379032258064516
  },
  "QwenPRM_max_scores": {
    "accuracy": 0.49916,
    "precision": 0.4990496388627679,
    "recall": 0.9998396600793683,
    "f1": 0.6657858210548794,
    "true_positives": 24943,
    "true_negatives": 15,
    "false_positives": 25038,
    "false_negatives": 4,
    "total": 50000,
    "normalized_precision": 0.5029942700748431,
    "normalized_recall": 0.9998449269737221,
    "selection_accuracy": 0.526,
    "solvable_selection_accuracy": 0.530241935483871
  },
  "QwenPRM_min_scores": {
    "accuracy": 0.783,
    "precision": 0.8950288628593847,
    "recall": 0.6401571331222191,
    "f1": 0.7464360831970087,
    "true_positives": 15970,
    "true_negatives": 23180,
    "false_positives": 1873,
    "false_negatives": 8977,
    "total": 50000,
    "normalized_precision": 0.6784856793882075,
    "normalized_recall": 0.48366972441439376,
    "selection_accuracy": 0.732,
    "solvable_selection_accuracy": 0.7379032258064516
  },
  "Skyworks_scores": {
    "accuracy": 0.51502,
    "precision": 0.5714578214578214,
    "recall": 0.11187717962079609,
    "f1": 0.18712078039623212,
    "true_positives": 2791,
    "true_negatives": 22960,
    "false_positives": 2093,
    "false_negatives": 22156,
    "total": 50000,
    "normalized_precision": 0.5300385776326765,
    "normalized_recall": 0.1007604851565749,
    "selection_accuracy": 0.534,
    "solvable_selection_accuracy": 0.5383064516129032
  },
  "URM_scores": {
    "accuracy": 0.52114,
    "precision": 0.6065365025466893,
    "recall": 0.11456287329137772,
    "f1": 0.19272396237229847,
    "true_positives": 2858,
    "true_negatives": 23199,
    "false_positives": 1854,
    "false_negatives": 22089,
    "total": 50000,
    "normalized_precision": 0.5270236824150353,
    "normalized_recall": 0.09671229108751826,
    "selection_accuracy": 0.548,
    "solvable_selection_accuracy": 0.5524193548387096
  }
}