{
  "DeepSeekLlama70B_verdicts": {
    "accuracy": 0.7528802588996764,
    "precision": 0.6517857142857143,
    "recall": 0.5428571428571428,
    "f1": 0.5923553277813367,
    "true_positives": 1387,
    "true_negatives": 4429,
    "false_positives": 741,
    "false_negatives": 1168,
    "total": 7725,
    "normalized_precision": 0.5688667338541049,
    "normalized_recall": 0.43334808579274503,
    "selection_accuracy": 0.34444444444444444,
    "solvable_selection_accuracy": 0.5961538461538461
  },
  "DeepSeekQwen32B_verdicts": {
    "accuracy": 0.75078125,
    "precision": 0.6430707876370887,
    "recall": 0.45956537228357675,
    "f1": 0.5360482027841262,
    "true_positives": 1290,
    "true_negatives": 5437,
    "false_positives": 716,
    "false_negatives": 1517,
    "total": 8960,
    "normalized_precision": 0.5007243423401558,
    "normalized_recall": 0.3831291759993952,
    "selection_accuracy": 0.3,
    "solvable_selection_accuracy": 0.5192307692307693
  },
  "Llama-3.3-70B-Instruct_verdicts": {
    "accuracy": 0.48528751123090746,
    "precision": 0.3498150016818029,
    "recall": 0.7436539149088309,
    "f1": 0.4758092188036143,
    "true_positives": 2080,
    "true_negatives": 2241,
    "false_positives": 3866,
    "false_negatives": 717,
    "total": 8904,
    "normalized_precision": 0.5495200992779717,
    "normalized_recall": 0.7067920710772422,
    "selection_accuracy": 0.3111111111111111,
    "solvable_selection_accuracy": 0.5384615384615384
  },
  "Mixtral-8x22B-Instruct-v0.1_verdicts": {
    "accuracy": 0.5577162238552855,
    "precision": 0.3790322580645161,
    "recall": 0.6443001443001443,
    "f1": 0.47728487439871725,
    "true_positives": 1786,
    "true_negatives": 3147,
    "false_positives": 2926,
    "false_negatives": 986,
    "total": 8845,
    "normalized_precision": 0.5487051757363988,
    "normalized_recall": 0.6163940858601316,
    "selection_accuracy": 0.3111111111111111,
    "solvable_selection_accuracy": 0.5384615384615384
  },
  "Qwen/Qwen2.5-72B-Instruct_verdicts": {
    "accuracy": 0.6329128310035231,
    "precision": 0.43526170798898073,
    "recall": 0.572463768115942,
    "f1": 0.4945226917057903,
    "true_positives": 1580,
    "true_negatives": 3989,
    "false_positives": 2050,
    "false_negatives": 1180,
    "total": 8799,
    "normalized_precision": 0.5167028681441481,
    "normalized_recall": 0.5239537895782779,
    "selection_accuracy": 0.28888888888888886,
    "solvable_selection_accuracy": 0.5
  },
  "SkyT1_verdicts": {
    "accuracy": 0.5196492876154689,
    "precision": 0.3880450759431651,
    "recall": 0.7353760445682451,
    "f1": 0.5080179602309173,
    "true_positives": 1584,
    "true_negatives": 1735,
    "false_positives": 2498,
    "false_negatives": 570,
    "total": 6387,
    "normalized_precision": 0.5583084000203651,
    "normalized_recall": 0.5328032165277773,
    "selection_accuracy": 0.3,
    "solvable_selection_accuracy": 0.5192307692307693
  },
  "WizardLM-2-8x22B_verdicts": {
    "accuracy": 0.6605343332552145,
    "precision": 0.464123206160308,
    "recall": 0.49257057949479943,
    "f1": 0.47792395026130835,
    "true_positives": 1326,
    "true_negatives": 4311,
    "false_positives": 1531,
    "false_negatives": 1366,
    "total": 8534,
    "normalized_precision": 0.5364185597855633,
    "normalized_recall": 0.40330421765271457,
    "selection_accuracy": 0.3333333333333333,
    "solvable_selection_accuracy": 0.5769230769230769
  }
}