{
  "judge_qwen2-72b-instruct_verdicts": {
    "accuracy": 0.62907,
    "precision": 0.5663393132276692,
    "recall": 0.8994595284705081,
    "f1": 0.6950466559789534,
    "true_positives": 42271,
    "true_negatives": 20636,
    "false_positives": 32368,
    "false_negatives": 4725,
    "total": 100000,
    "normalized_precision": 0.5020446681642258,
    "normalized_recall": 0.8067349477327536,
    "selection_accuracy": 0.49,
    "solvable_selection_accuracy": 0.494949494949495
  },
  "judge_qwen2.5-72b-instruct-turbo_verdicts": {
    "accuracy": 0.70882,
    "precision": 0.6349935062973814,
    "recall": 0.8947144437824496,
    "f1": 0.7428056600773756,
    "true_positives": 42048,
    "true_negatives": 28834,
    "false_positives": 24170,
    "false_negatives": 4948,
    "total": 100000,
    "normalized_precision": 0.5304347545046565,
    "normalized_recall": 0.7687439391760791,
    "selection_accuracy": 0.48,
    "solvable_selection_accuracy": 0.48484848484848486
  },
  "judge_qwq-32b-preview_verdicts": {
    "accuracy": 0.5310932446909699,
    "precision": 0.9444444444444444,
    "recall": 0.0014499861398383693,
    "f1": 0.002895526836849837,
    "true_positives": 68,
    "true_negatives": 52976,
    "false_positives": 4,
    "false_negatives": 46829,
    "total": 99877,
    "normalized_precision": 0.049910873440285206,
    "normalized_recall": 0.0007313498274664989,
    "selection_accuracy": 0.43,
    "solvable_selection_accuracy": 0.43434343434343436
  },
  "judge_nous-hermes-2-mixtral-8x7b-dpo_verdicts": {
    "accuracy": 0.53758,
    "precision": 0.5042440616908702,
    "recall": 0.95310239169291,
    "f1": 0.6595497180215865,
    "true_positives": 44792,
    "true_negatives": 8966,
    "false_positives": 44038,
    "false_negatives": 2204,
    "total": 100000,
    "normalized_precision": 0.4874893202174422,
    "normalized_recall": 0.9279593640095867,
    "selection_accuracy": 0.41,
    "solvable_selection_accuracy": 0.41414141414141414
  },
  "judge_llama-3.1-nemotron-70b-instruct-hf_verdicts": {
    "accuracy": 0.5383097867968148,
    "precision": 0.715203426124197,
    "recall": 0.0074000221557549576,
    "f1": 0.014648480329810096,
    "true_positives": 334,
    "true_negatives": 52057,
    "false_positives": 133,
    "false_negatives": 44801,
    "total": 97325,
    "normalized_precision": 0.18838164997519333,
    "normalized_recall": 0.00513332799260114,
    "selection_accuracy": 0.44,
    "solvable_selection_accuracy": 0.4444444444444444
  },
  "judge_meta-llama-3.1-405b-instruct-turbo_verdicts": {
    "accuracy": 0.6206058302154819,
    "precision": 0.5583424772477893,
    "recall": 0.9209607358828038,
    "f1": 0.6952077088138617,
    "true_positives": 43252,
    "true_negatives": 18785,
    "false_positives": 34213,
    "false_negatives": 3712,
    "total": 99962,
    "normalized_precision": 0.49863167488622945,
    "normalized_recall": 0.8380424418615536,
    "selection_accuracy": 0.48,
    "solvable_selection_accuracy": 0.48484848484848486
  },
  "judge_gemma-2-27b-it_verdicts": {
    "accuracy": 0.62639,
    "precision": 0.5737692366587551,
    "recall": 0.7973018980338752,
    "f1": 0.6673137371884489,
    "true_positives": 37470,
    "true_negatives": 25169,
    "false_positives": 27835,
    "false_negatives": 9526,
    "total": 100000,
    "normalized_precision": 0.4985836450492602,
    "normalized_recall": 0.7185541829623073,
    "selection_accuracy": 0.48,
    "solvable_selection_accuracy": 0.48484848484848486
  },
  "judge_claude-3-5-sonnet-latest_verdicts": {
    "accuracy": 0.67165,
    "precision": 0.6036661249469261,
    "recall": 0.8773299855306834,
    "f1": 0.7152137523092534,
    "true_positives": 41231,
    "true_negatives": 25934,
    "false_positives": 27070,
    "false_negatives": 5765,
    "total": 100000,
    "normalized_precision": 0.5083569229174985,
    "normalized_recall": 0.7727803435241097,
    "selection_accuracy": 0.51,
    "solvable_selection_accuracy": 0.5151515151515151
  },
  "judge_llama-3.3-70b-instruct-turbo_verdicts": {
    "accuracy": 0.62078,
    "precision": 0.558284730608155,
    "recall": 0.9247169971912503,
    "f1": 0.6962303144875759,
    "true_positives": 43458,
    "true_negatives": 18620,
    "false_positives": 34384,
    "false_negatives": 3538,
    "total": 100000,
    "normalized_precision": 0.4992402319442421,
    "normalized_recall": 0.8608534709747907,
    "selection_accuracy": 0.46,
    "solvable_selection_accuracy": 0.46464646464646464
  },
  "judge_gpt-4o_verdicts": {
    "accuracy": 0.69628,
    "precision": 0.6669679803945201,
    "recall": 0.7065069367605754,
    "f1": 0.6861683440451342,
    "true_positives": 33203,
    "true_negatives": 36425,
    "false_positives": 16579,
    "false_negatives": 13793,
    "total": 100000,
    "normalized_precision": 0.5099794645598937,
    "normalized_recall": 0.5478563275457821,
    "selection_accuracy": 0.53,
    "solvable_selection_accuracy": 0.5353535353535354
  },
  "gpt-4o_verdicts_v1": {
    "accuracy": 0.8403491736157689,
    "precision": 0.7942889668911868,
    "recall": 0.8912899997870937,
    "f1": 0.8399983947669403,
    "true_positives": 41863,
    "true_negatives": 42082,
    "false_positives": 10842,
    "false_negatives": 5106,
    "total": 99893,
    "normalized_precision": 0.6293157996802314,
    "normalized_recall": 0.7016576794562649,
    "selection_accuracy": 0.65,
    "solvable_selection_accuracy": 0.6565656565656566
  },
  "claude-3-5-sonnet-latest_verdicts_v1": {
    "accuracy": 0.74137,
    "precision": 0.6580793800397947,
    "recall": 0.9359945527278918,
    "f1": 0.7728107239171109,
    "true_positives": 43988,
    "true_negatives": 30149,
    "false_positives": 22855,
    "false_negatives": 3008,
    "total": 100000,
    "normalized_precision": 0.55030863154208,
    "normalized_recall": 0.8434995651619684,
    "selection_accuracy": 0.51,
    "solvable_selection_accuracy": 0.5151515151515151
  },
  "gpt-4o-mini_verdicts_v1": {
    "accuracy": 0.8385261745235305,
    "precision": 0.8040240654896933,
    "recall": 0.868194598278947,
    "f1": 0.8348780762573865,
    "true_positives": 40760,
    "true_negatives": 42966,
    "false_positives": 9935,
    "false_negatives": 6188,
    "total": 99849,
    "normalized_precision": 0.6253184901960723,
    "normalized_recall": 0.7033024655169894,
    "selection_accuracy": 0.65,
    "solvable_selection_accuracy": 0.6565656565656566
  },
  "claude-3-5-haiku-latest_verdicts_v1": {
    "accuracy": 0.6344687493745872,
    "precision": 0.5654746978283766,
    "recall": 0.9601260298469333,
    "f1": 0.7117550047739665,
    "true_positives": 45100,
    "true_negatives": 18305,
    "false_positives": 34656,
    "false_negatives": 1873,
    "total": 99934,
    "normalized_precision": 0.5157659852191098,
    "normalized_recall": 0.9148144816959358,
    "selection_accuracy": 0.49,
    "solvable_selection_accuracy": 0.494949494949495
  },
  "Qwen/Qwen2.5-72B-Instruct-Turbo_verdicts_v1": {
    "accuracy": 0.7592421997036087,
    "precision": 0.6742176943048449,
    "recall": 0.9449492477603048,
    "f1": 0.7869497412632026,
    "true_positives": 44406,
    "true_negatives": 31418,
    "false_positives": 21457,
    "false_negatives": 2587,
    "total": 99868,
    "normalized_precision": 0.5624898522019361,
    "normalized_recall": 0.8220231174513691,
    "selection_accuracy": 0.57,
    "solvable_selection_accuracy": 0.5757575757575758
  },
  "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo_verdicts_v1": {
    "accuracy": 0.5941041022844331,
    "precision": 0.5395942525047243,
    "recall": 0.9690101165322065,
    "f1": 0.693186762853544,
    "true_positives": 45402,
    "true_negatives": 13425,
    "false_positives": 38739,
    "false_negatives": 1452,
    "total": 99018,
    "normalized_precision": 0.5067093709228109,
    "normalized_recall": 0.9149503845642936,
    "selection_accuracy": 0.5,
    "solvable_selection_accuracy": 0.5050505050505051
  },
  "meta-llama/Llama-3.3-70B-Instruct-Turbo_verdicts_v1": {
    "accuracy": 0.5879300790892147,
    "precision": 0.534853864748268,
    "recall": 0.9854454791484646,
    "f1": 0.693375715956458,
    "true_positives": 46244,
    "true_negatives": 12111,
    "false_positives": 40217,
    "false_negatives": 683,
    "total": 99255,
    "normalized_precision": 0.5049440776195812,
    "normalized_recall": 0.948282467136881,
    "selection_accuracy": 0.49,
    "solvable_selection_accuracy": 0.494949494949495
  }
}