{
  "DeepSeekLlama70B_verdicts": {
    "accuracy": 0.7793969002695418,
    "precision": 0.7953179453507753,
    "recall": 0.9299846444601937,
    "f1": 0.8573956603414009,
    "true_positives": 31493,
    "true_negatives": 5519,
    "false_positives": 8105,
    "false_negatives": 2371,
    "total": 47488,
    "normalized_precision": 0.8117781624809675,
    "normalized_recall": 0.8416752178482331,
    "selection_accuracy": 0.754,
    "solvable_selection_accuracy": 0.8195652173913044
  },
  "DeepSeekQwen32B_verdicts": {
    "accuracy": 0.7570941653389266,
    "precision": 0.7749073917148875,
    "recall": 0.9209564641901419,
    "f1": 0.8416429998948144,
    "true_positives": 32006,
    "true_negatives": 5533,
    "false_positives": 9297,
    "false_negatives": 2747,
    "total": 49583,
    "normalized_precision": 0.7921086802458868,
    "normalized_recall": 0.8607629499088114,
    "selection_accuracy": 0.734,
    "solvable_selection_accuracy": 0.7978260869565217
  },
  "Llama-3.3-70B-Instruct_verdicts": {
    "accuracy": 0.7291219482838172,
    "precision": 0.7291867670075685,
    "recall": 0.9750014334040479,
    "f1": 0.8343657618095506,
    "true_positives": 34010,
    "true_negatives": 2336,
    "false_positives": 12631,
    "false_negatives": 872,
    "total": 49849,
    "normalized_precision": 0.771620976235271,
    "normalized_recall": 0.9534664257207581,
    "selection_accuracy": 0.712,
    "solvable_selection_accuracy": 0.7739130434782608
  },
  "Mixtral-8x22B-Instruct-v0.1_verdicts": {
    "accuracy": 0.7313064578617892,
    "precision": 0.7355593250602624,
    "recall": 0.9612366924310923,
    "f1": 0.8333902312584557,
    "true_positives": 32956,
    "true_negatives": 2908,
    "false_positives": 11848,
    "false_negatives": 1329,
    "total": 49041,
    "normalized_precision": 0.773951809045721,
    "normalized_recall": 0.9231245800027728,
    "selection_accuracy": 0.716,
    "solvable_selection_accuracy": 0.7782608695652173
  },
  "Qwen/Qwen2.5-72B-Instruct_verdicts": {
    "accuracy": 0.7259594043823897,
    "precision": 0.743306505546203,
    "recall": 0.929331372718623,
    "f1": 0.8259744253389026,
    "true_positives": 32232,
    "true_negatives": 3748,
    "false_positives": 11131,
    "false_negatives": 2451,
    "total": 49562,
    "normalized_precision": 0.7758267936081484,
    "normalized_recall": 0.8974662364035987,
    "selection_accuracy": 0.714,
    "solvable_selection_accuracy": 0.7760869565217391
  },
  "SkyT1_verdicts": {
    "accuracy": 0.7536447209583063,
    "precision": 0.7648670724316918,
    "recall": 0.945046716801771,
    "f1": 0.8454638124362894,
    "true_positives": 29029,
    "true_negatives": 3435,
    "false_positives": 8924,
    "false_negatives": 1688,
    "total": 43076,
    "normalized_precision": 0.7856742845753544,
    "normalized_recall": 0.7851527752443775,
    "selection_accuracy": 0.722,
    "solvable_selection_accuracy": 0.7847826086956522
  },
  "WizardLM-2-8x22B_verdicts": {
    "accuracy": 0.7333660391243024,
    "precision": 0.7551476418154297,
    "recall": 0.9167809118221859,
    "f1": 0.8281513490725128,
    "true_positives": 31430,
    "true_negatives": 4447,
    "false_positives": 10191,
    "false_negatives": 2853,
    "total": 48921,
    "normalized_precision": 0.7779432918535536,
    "normalized_recall": 0.8577004050091565,
    "selection_accuracy": 0.732,
    "solvable_selection_accuracy": 0.7956521739130434
  }
}