{
  "DeepSeekLlama70B_verdicts": {
    "accuracy": 0.7457841170789689,
    "precision": 0.7017265872582231,
    "recall": 0.9026567332021367,
    "f1": 0.7896095911466339,
    "true_positives": 12843,
    "true_negatives": 7235,
    "false_positives": 5459,
    "false_negatives": 1385,
    "total": 26922,
    "normalized_precision": 0.5891658900112026,
    "normalized_recall": 0.34284587359647006,
    "selection_accuracy": 0.5,
    "solvable_selection_accuracy": 0.6175908221797323
  },
  "DeepSeekQwen32B_verdicts": {
    "accuracy": 0.6977102970781359,
    "precision": 0.6186089933094756,
    "recall": 0.8391304347826087,
    "f1": 0.7121898790864308,
    "true_positives": 19879,
    "true_negatives": 17205,
    "false_positives": 12256,
    "false_negatives": 3811,
    "total": 53151,
    "normalized_precision": 0.5713146599391897,
    "normalized_recall": 0.6041036232803491,
    "selection_accuracy": 0.47678018575851394,
    "solvable_selection_accuracy": 0.5889101338432122
  },
  "Mixtral-8x22B-Instruct-v0.1_verdicts": {
    "accuracy": 0.7543101416020996,
    "precision": 0.7121394449914306,
    "recall": 0.7161874724952325,
    "f1": 0.7141577224771901,
    "true_positives": 19529,
    "true_negatives": 28467,
    "false_positives": 7894,
    "false_negatives": 7739,
    "total": 63629,
    "normalized_precision": 0.5913914819870439,
    "normalized_recall": 0.5582090977686873,
    "selection_accuracy": 0.48761609907120745,
    "solvable_selection_accuracy": 0.6022944550669216
  },
  "Qwen/Qwen2.5-72B-Instruct_verdicts": {
    "accuracy": 0.6162242500759113,
    "precision": 0.5303168022701577,
    "recall": 0.8942764215484691,
    "f1": 0.6658038298819862,
    "true_positives": 23921,
    "true_negatives": 14638,
    "false_positives": 21186,
    "false_negatives": 2828,
    "total": 62573,
    "normalized_precision": 0.5520292091968696,
    "normalized_recall": 0.7952790550477753,
    "selection_accuracy": 0.45975232198142413,
    "solvable_selection_accuracy": 0.5678776290630975
  },
  "SkyT1_verdicts": {
    "accuracy": 0.7004169148302561,
    "precision": 0.6133282347215463,
    "recall": 0.8358838300090018,
    "f1": 0.7075170934172799,
    "true_positives": 17643,
    "true_negatives": 16461,
    "false_positives": 11123,
    "false_negatives": 3464,
    "total": 48691,
    "normalized_precision": 0.5656761097122569,
    "normalized_recall": 0.5235625794944914,
    "selection_accuracy": 0.47523219814241485,
    "solvable_selection_accuracy": 0.5869980879541109
  },
  "WizardLM-2-8x22B_verdicts": {
    "accuracy": 0.7501882272416154,
    "precision": 0.7561504708376473,
    "recall": 0.6137254901960785,
    "f1": 0.677534071079893,
    "true_positives": 15337,
    "true_negatives": 28504,
    "false_positives": 4946,
    "false_negatives": 9653,
    "total": 58440,
    "normalized_precision": 0.607091151899326,
    "normalized_recall": 0.4076408318057177,
    "selection_accuracy": 0.4845201238390093,
    "solvable_selection_accuracy": 0.5984703632887189
  }
}