{
  "judge_gpt-4o_verdicts": {
    "accuracy": 0.799,
    "precision": 0.24699907663896584,
    "recall": 0.7503506311360448,
    "f1": 0.37165682528655786,
    "true_positives": 535,
    "true_negatives": 6656,
    "false_positives": 1631,
    "false_negatives": 178,
    "total": 9000,
    "normalized_precision": 0.30464814591318345,
    "normalized_recall": 0.4503200143531028,
    "selection_accuracy": 0.07777777777777778,
    "solvable_selection_accuracy": 0.25
  },
  "judge_llama-3.1-nemotron-70b-instruct-hf_verdicts": {
    "accuracy": 0.8485555555555555,
    "precision": 0.2988861386138614,
    "recall": 0.6774193548387096,
    "f1": 0.41477028767711466,
    "true_positives": 483,
    "true_negatives": 7154,
    "false_positives": 1133,
    "false_negatives": 230,
    "total": 9000,
    "normalized_precision": 0.3704233943829651,
    "normalized_recall": 0.447823385884857,
    "selection_accuracy": 0.08888888888888889,
    "solvable_selection_accuracy": 0.2857142857142857
  },
  "judge_llama-3.3-70b-instruct-turbo_verdicts": {
    "accuracy": 0.6431111111111111,
    "precision": 0.16883116883116883,
    "recall": 0.8934081346423562,
    "f1": 0.2839946500222916,
    "true_positives": 637,
    "true_negatives": 5151,
    "false_positives": 3136,
    "false_negatives": 76,
    "total": 9000,
    "normalized_precision": 0.3196516895918681,
    "normalized_recall": 0.695282307663739,
    "selection_accuracy": 0.1,
    "solvable_selection_accuracy": 0.32142857142857145
  },
  "judge_meta-llama-3.1-405b-instruct-turbo_verdicts": {
    "accuracy": 0.7054444444444444,
    "precision": 0.1971875,
    "recall": 0.884992987377279,
    "f1": 0.3225146946077178,
    "true_positives": 631,
    "true_negatives": 5718,
    "false_positives": 2569,
    "false_negatives": 82,
    "total": 9000,
    "normalized_precision": 0.34209090332285214,
    "normalized_recall": 0.7039216944389677,
    "selection_accuracy": 0.08888888888888889,
    "solvable_selection_accuracy": 0.2857142857142857
  },
  "judge_nous-hermes-2-mixtral-8x7b-dpo_verdicts": {
    "accuracy": 0.19555555555555557,
    "precision": 0.08484925581987024,
    "recall": 0.9354838709677419,
    "f1": 0.15558665733613247,
    "true_positives": 667,
    "true_negatives": 1093,
    "false_positives": 7194,
    "false_negatives": 46,
    "total": 9000,
    "normalized_precision": 0.2668030395246162,
    "normalized_recall": 0.9519602183014746,
    "selection_accuracy": 0.06666666666666667,
    "solvable_selection_accuracy": 0.21428571428571427
  },
  "judge_qwen2-72b-instruct_verdicts": {
    "accuracy": 0.7434444444444445,
    "precision": 0.20422535211267606,
    "recall": 0.7727910238429172,
    "f1": 0.3230724127821753,
    "true_positives": 551,
    "true_negatives": 6140,
    "false_positives": 2147,
    "false_negatives": 162,
    "total": 9000,
    "normalized_precision": 0.33510238047066737,
    "normalized_recall": 0.6495756267550761,
    "selection_accuracy": 0.08888888888888889,
    "solvable_selection_accuracy": 0.2857142857142857
  },
  "judge_qwen2.5-72b-instruct-turbo_verdicts": {
    "accuracy": 0.4978888888888889,
    "precision": 0.13005443234836703,
    "recall": 0.938288920056101,
    "f1": 0.22844459620966365,
    "true_positives": 669,
    "true_negatives": 3812,
    "false_positives": 4475,
    "false_negatives": 44,
    "total": 9000,
    "normalized_precision": 0.3124346943453911,
    "normalized_recall": 0.8580708620334555,
    "selection_accuracy": 0.06666666666666667,
    "solvable_selection_accuracy": 0.21428571428571427
  },
  "judge_qwq-32b-preview_verdicts": {
    "accuracy": 0.9338888888888889,
    "precision": 0.9538461538461539,
    "recall": 0.17391304347826086,
    "f1": 0.2941874258600237,
    "true_positives": 124,
    "true_negatives": 8281,
    "false_positives": 6,
    "false_negatives": 589,
    "total": 9000,
    "normalized_precision": 0.21428571428571427,
    "normalized_recall": 0.05470280715814783,
    "selection_accuracy": 0.07777777777777778,
    "solvable_selection_accuracy": 0.25
  }
}