{
  "judge_claude-3-5-sonnet-latest_verdicts": {
    "accuracy": 0.8866142857142857,
    "precision": 0.37359981408319776,
    "recall": 0.7702922855773838,
    "f1": 0.5031611893583724,
    "true_positives": 8038,
    "true_negatives": 116088,
    "false_positives": 13477,
    "false_negatives": 2397,
    "total": 140000,
    "normalized_precision": 0.2770312134335934,
    "normalized_recall": 0.48781277357014,
    "selection_accuracy": 0.09285714285714286,
    "solvable_selection_accuracy": 0.325
  },
  "judge_gemma-2-27b-it_verdicts": {
    "accuracy": 0.3071634636085715,
    "precision": 0.09393362965195118,
    "recall": 0.9682585905649388,
    "f1": 0.17125345511356804,
    "true_positives": 9975,
    "true_negatives": 32827,
    "false_positives": 96217,
    "false_negatives": 327,
    "total": 139346,
    "normalized_precision": 0.27098333692677307,
    "normalized_recall": 0.8755997684935813,
    "selection_accuracy": 0.07857142857142857,
    "solvable_selection_accuracy": 0.275
  },
  "judge_gpt-4o_verdicts": {
    "accuracy": 0.9139071428571428,
    "precision": 0.43077186376861204,
    "recall": 0.48241494968854814,
    "f1": 0.4551331314135889,
    "true_positives": 5034,
    "true_negatives": 122913,
    "false_positives": 6652,
    "false_negatives": 5401,
    "total": 140000,
    "normalized_precision": 0.26971446034294894,
    "normalized_recall": 0.2675997260062934,
    "selection_accuracy": 0.07857142857142857,
    "solvable_selection_accuracy": 0.275
  },
  "judge_llama-3.1-nemotron-70b-instruct-hf_verdicts": {
    "accuracy": 0.9298785714285714,
    "precision": 0.5708065994500459,
    "recall": 0.2387158600862482,
    "f1": 0.3366443678626934,
    "true_positives": 2491,
    "true_negatives": 127692,
    "false_positives": 1873,
    "false_negatives": 7944,
    "total": 140000,
    "normalized_precision": 0.22713783853812744,
    "normalized_recall": 0.11999688687612937,
    "selection_accuracy": 0.1,
    "solvable_selection_accuracy": 0.35
  },
  "judge_llama-3.3-70b-instruct-turbo_verdicts": {
    "accuracy": 0.8585775612682948,
    "precision": 0.32936074057875936,
    "recall": 0.8660277910876857,
    "f1": 0.4772265202122884,
    "true_positives": 9037,
    "true_negatives": 111163,
    "false_positives": 18401,
    "false_negatives": 1398,
    "total": 139999,
    "normalized_precision": 0.3419119758241181,
    "normalized_recall": 0.6657182966237013,
    "selection_accuracy": 0.1,
    "solvable_selection_accuracy": 0.35
  },
  "judge_meta-llama-3.1-405b-instruct-turbo_verdicts": {
    "accuracy": 0.8714642857142857,
    "precision": 0.2987649063032368,
    "recall": 0.5378054623862003,
    "f1": 0.384133611691023,
    "true_positives": 5612,
    "true_negatives": 116393,
    "false_positives": 13172,
    "false_negatives": 4823,
    "total": 140000,
    "normalized_precision": 0.279973562225054,
    "normalized_recall": 0.4712742359779328,
    "selection_accuracy": 0.10714285714285714,
    "solvable_selection_accuracy": 0.375
  },
  "judge_mixtral-8x22b-instruct-v0.1_verdicts": {
    "accuracy": 0.9254642857142857,
    "precision": 0,
    "recall": 0.0,
    "f1": 0,
    "true_positives": 0,
    "true_negatives": 129565,
    "false_positives": 0,
    "false_negatives": 10435,
    "total": 140000,
    "normalized_precision": 0.0,
    "normalized_recall": 0.0,
    "selection_accuracy": 0.07142857142857142,
    "solvable_selection_accuracy": 0.25
  },
  "judge_nous-hermes-2-mixtral-8x7b-dpo_verdicts": {
    "accuracy": 0.2606785714285714,
    "precision": 0.08078842585085491,
    "recall": 0.8594154288452324,
    "f1": 0.14769311846905084,
    "true_positives": 8968,
    "true_negatives": 27527,
    "false_positives": 102038,
    "false_negatives": 1467,
    "total": 140000,
    "normalized_precision": 0.25374208303056833,
    "normalized_recall": 0.8386687205461827,
    "selection_accuracy": 0.07142857142857142,
    "solvable_selection_accuracy": 0.25
  },
  "judge_qwen2-72b-instruct_verdicts": {
    "accuracy": 0.7936142857142857,
    "precision": 0.19911980440097798,
    "recall": 0.5853378054623862,
    "f1": 0.29715397713451713,
    "true_positives": 6108,
    "true_negatives": 104998,
    "false_positives": 24567,
    "false_negatives": 4327,
    "total": 140000,
    "normalized_precision": 0.2536739371603771,
    "normalized_recall": 0.4536911623804437,
    "selection_accuracy": 0.1,
    "solvable_selection_accuracy": 0.35
  },
  "judge_qwen2.5-72b-instruct-turbo_verdicts": {
    "accuracy": 0.7713714285714286,
    "precision": 0.21594290680220157,
    "recall": 0.785816962146622,
    "f1": 0.3387869773591142,
    "true_positives": 8200,
    "true_negatives": 99792,
    "false_positives": 29773,
    "false_negatives": 2235,
    "total": 140000,
    "normalized_precision": 0.2955356203292204,
    "normalized_recall": 0.6401973266277768,
    "selection_accuracy": 0.08571428571428572,
    "solvable_selection_accuracy": 0.3
  },
  "judge_qwq-32b-preview_verdicts": {
    "accuracy": 0.9209656769168257,
    "precision": 0.8666666666666667,
    "recall": 0.0012596899224806201,
    "f1": 0.002515723270440252,
    "true_positives": 13,
    "true_negatives": 120115,
    "false_positives": 2,
    "false_negatives": 10307,
    "total": 130437,
    "normalized_precision": 0.025,
    "normalized_recall": 0.000325,
    "selection_accuracy": 0.07142857142857142,
    "solvable_selection_accuracy": 0.25
  },
  "judge_wizardlm-2-8x22b_verdicts": {
    "accuracy": 0.9254642857142857,
    "precision": 0,
    "recall": 0.0,
    "f1": 0,
    "true_positives": 0,
    "true_negatives": 129565,
    "false_positives": 0,
    "false_negatives": 10435,
    "total": 140000,
    "normalized_precision": 0.0,
    "normalized_recall": 0.0,
    "selection_accuracy": 0.07142857142857142,
    "solvable_selection_accuracy": 0.25
  }
}