{
  "DeepSeekLlama70B_verdicts": {
    "accuracy": 0.8620672644080757,
    "precision": 0.8782362196533295,
    "recall": 0.9737597139520466,
    "f1": 0.9235344792222177,
    "true_positives": 67539,
    "true_negatives": 2360,
    "false_positives": 9364,
    "false_negatives": 1820,
    "total": 81083,
    "normalized_precision": 0.8733368291431108,
    "normalized_recall": 0.9351925751852797,
    "selection_accuracy": 0.8391356542617047,
    "solvable_selection_accuracy": 0.8421686746987952
  },
  "DeepSeekQwen32B_verdicts": {
    "accuracy": 0.8573501336387934,
    "precision": 0.8700485257661269,
    "recall": 0.9799061395431728,
    "f1": 0.921715443179437,
    "true_positives": 65981,
    "true_negatives": 1381,
    "false_positives": 9855,
    "false_negatives": 1353,
    "total": 78570,
    "normalized_precision": 0.8632862994490902,
    "normalized_recall": 0.9167762439414646,
    "selection_accuracy": 0.8403361344537815,
    "solvable_selection_accuracy": 0.8433734939759037
  },
  "Llama-3.3-70B-Instruct_verdicts": {
    "accuracy": 0.8530367979963395,
    "precision": 0.8565456894240379,
    "recall": 0.993862861829546,
    "f1": 0.9201091830255742,
    "true_positives": 70283,
    "true_negatives": 560,
    "false_positives": 11771,
    "false_negatives": 434,
    "total": 83048,
    "normalized_precision": 0.8580117428692378,
    "normalized_recall": 0.986886549984267,
    "selection_accuracy": 0.8307322929171669,
    "solvable_selection_accuracy": 0.8337349397590361
  },
  "Mixtral-8x22B-Instruct-v0.1_verdicts": {
    "accuracy": 0.8527081629414421,
    "precision": 0.8551541633331282,
    "recall": 0.995729129344321,
    "f1": 0.9201032975764799,
    "true_positives": 69477,
    "true_negatives": 376,
    "false_positives": 11768,
    "false_negatives": 298,
    "total": 81919,
    "normalized_precision": 0.8563522000954523,
    "normalized_recall": 0.9763206139500543,
    "selection_accuracy": 0.8307322929171669,
    "solvable_selection_accuracy": 0.8337349397590361
  },
  "Qwen/Qwen2.5-72B-Instruct_verdicts": {
    "accuracy": 0.8439106402022328,
    "precision": 0.8576121445903068,
    "recall": 0.9792282336894749,
    "f1": 0.9143941253341646,
    "true_positives": 68922,
    "true_negatives": 850,
    "false_positives": 11443,
    "false_negatives": 1462,
    "total": 82677,
    "normalized_precision": 0.8578290366333862,
    "normalized_recall": 0.9637903713724919,
    "selection_accuracy": 0.8343337334933973,
    "solvable_selection_accuracy": 0.8373493975903614
  },
  "SkyT1_verdicts": {
    "accuracy": 0.8567272533162517,
    "precision": 0.8634745948313622,
    "recall": 0.9880646576029071,
    "f1": 0.9215777940102263,
    "true_positives": 63082,
    "true_negatives": 1116,
    "false_positives": 9974,
    "false_negatives": 762,
    "total": 74934,
    "normalized_precision": 0.8643351140771375,
    "normalized_recall": 0.8843699176610906,
    "selection_accuracy": 0.8355342136854742,
    "solvable_selection_accuracy": 0.8385542168674699
  },
  "WizardLM-2-8x22B_verdicts": {
    "accuracy": 0.841645900443688,
    "precision": 0.8610491637100862,
    "recall": 0.9705634506891381,
    "f1": 0.912532312753886,
    "true_positives": 67954,
    "true_negatives": 1284,
    "false_positives": 10966,
    "false_negatives": 2061,
    "total": 82265,
    "normalized_precision": 0.8589187343046825,
    "normalized_recall": 0.9476174704404131,
    "selection_accuracy": 0.8271308523409364,
    "solvable_selection_accuracy": 0.8301204819277108
  }
}