{
  "DeepSeekLlama70B_verdicts": {
    "accuracy": 0.905553606921548,
    "precision": 0.9252537438694957,
    "recall": 0.9673960732697159,
    "f1": 0.9458557314637318,
    "true_positives": 35279,
    "true_negatives": 3447,
    "false_positives": 2850,
    "false_negatives": 1189,
    "total": 42765,
    "normalized_precision": 0.8436859361910192,
    "normalized_recall": 0.8310881581762494,
    "selection_accuracy": 0.836,
    "solvable_selection_accuracy": 0.847870182555781
  },
  "DeepSeekQwen32B_verdicts": {
    "accuracy": 0.8497221995261051,
    "precision": 0.8883215534371306,
    "recall": 0.9237066485327196,
    "f1": 0.905668602787501,
    "true_positives": 35317,
    "true_negatives": 6282,
    "false_positives": 4440,
    "false_negatives": 2917,
    "total": 48956,
    "normalized_precision": 0.8186891475864756,
    "normalized_recall": 0.8340805773297473,
    "selection_accuracy": 0.802,
    "solvable_selection_accuracy": 0.8133874239350912
  },
  "Llama-3.3-70B-Instruct_verdicts": {
    "accuracy": 0.8075418714271387,
    "precision": 0.8170320404721754,
    "recall": 0.9709169385710248,
    "f1": 0.8873522194958733,
    "true_positives": 37791,
    "true_negatives": 2469,
    "false_positives": 8463,
    "false_negatives": 1132,
    "total": 49855,
    "normalized_precision": 0.8006864914310137,
    "normalized_recall": 0.9362193437977456,
    "selection_accuracy": 0.794,
    "solvable_selection_accuracy": 0.8052738336713996
  },
  "Mixtral-8x22B-Instruct-v0.1_verdicts": {
    "accuracy": 0.8122724698648977,
    "precision": 0.8266497065452679,
    "recall": 0.9607334595533885,
    "f1": 0.888662316476346,
    "true_positives": 37043,
    "true_negatives": 3119,
    "false_positives": 7768,
    "false_negatives": 1514,
    "total": 49444,
    "normalized_precision": 0.8016466300622361,
    "normalized_recall": 0.918258423999222,
    "selection_accuracy": 0.794,
    "solvable_selection_accuracy": 0.8052738336713996
  },
  "Qwen/Qwen2.5-72B-Instruct_verdicts": {
    "accuracy": 0.8197172194605167,
    "precision": 0.846985036762115,
    "recall": 0.9391800375078141,
    "f1": 0.8907031607030372,
    "true_positives": 36057,
    "true_negatives": 4178,
    "false_positives": 6514,
    "false_negatives": 2335,
    "total": 49084,
    "normalized_precision": 0.8080342716537329,
    "normalized_recall": 0.8769799924555759,
    "selection_accuracy": 0.798,
    "solvable_selection_accuracy": 0.8093306288032455
  },
  "SkyT1_verdicts": {
    "accuracy": 0.8449297065331575,
    "precision": 0.8629652996845426,
    "recall": 0.9577626529983475,
    "f1": 0.9078961342395921,
    "true_positives": 34195,
    "true_negatives": 3608,
    "false_positives": 5430,
    "false_negatives": 1508,
    "total": 44741,
    "normalized_precision": 0.8154169680055013,
    "normalized_recall": 0.8267615858487501,
    "selection_accuracy": 0.808,
    "solvable_selection_accuracy": 0.8194726166328601
  },
  "WizardLM-2-8x22B_verdicts": {
    "accuracy": 0.8096998829111974,
    "precision": 0.854344895207746,
    "recall": 0.9129796987557302,
    "f1": 0.8826896289730277,
    "true_positives": 34853,
    "true_negatives": 4564,
    "false_positives": 5942,
    "false_negatives": 3322,
    "total": 48681,
    "normalized_precision": 0.8058778505583042,
    "normalized_recall": 0.8387975348179918,
    "selection_accuracy": 0.804,
    "solvable_selection_accuracy": 0.8154158215010142
  }
}