{
  "DeepSeekLlama70B_verdicts": {
    "accuracy": 0.879899269410404,
    "precision": 0.9012989537106022,
    "recall": 0.9634849455477258,
    "f1": 0.9313550741010386,
    "true_positives": 55648,
    "true_negatives": 4450,
    "false_positives": 6094,
    "false_negatives": 2109,
    "total": 68301,
    "normalized_precision": 0.8900276907229256,
    "normalized_recall": 0.9022940331721244,
    "selection_accuracy": 0.8595271210013908,
    "solvable_selection_accuracy": 0.8956521739130435
  },
  "DeepSeekQwen32B_verdicts": {
    "accuracy": 0.85473411154345,
    "precision": 0.8807205452775073,
    "recall": 0.9539191671769749,
    "f1": 0.9158596135944211,
    "true_positives": 56079,
    "true_negatives": 4549,
    "false_positives": 7595,
    "false_negatives": 2709,
    "total": 70932,
    "normalized_precision": 0.8777334112451773,
    "normalized_recall": 0.9100182120731201,
    "selection_accuracy": 0.847009735744089,
    "solvable_selection_accuracy": 0.8826086956521739
  },
  "Llama-3.3-70B-Instruct_verdicts": {
    "accuracy": 0.8421797891431135,
    "precision": 0.8511684575186425,
    "recall": 0.9810838705311067,
    "f1": 0.9115203533841088,
    "true_positives": 57985,
    "true_negatives": 2086,
    "false_positives": 10139,
    "false_negatives": 1118,
    "total": 71328,
    "normalized_precision": 0.8659243582225913,
    "normalized_recall": 0.9560447913468444,
    "selection_accuracy": 0.8331015299026425,
    "solvable_selection_accuracy": 0.8681159420289855
  },
  "Mixtral-8x22B-Instruct-v0.1_verdicts": {
    "accuracy": 0.8383635917920025,
    "precision": 0.8469383143782481,
    "recall": 0.9817013549378405,
    "f1": 0.9093541004552915,
    "true_positives": 56224,
    "true_negatives": 1914,
    "false_positives": 10161,
    "false_negatives": 1048,
    "total": 69347,
    "normalized_precision": 0.8653954126607253,
    "normalized_recall": 0.9294423567562474,
    "selection_accuracy": 0.8400556328233658,
    "solvable_selection_accuracy": 0.8753623188405797
  },
  "Qwen/Qwen2.5-72B-Instruct_verdicts": {
    "accuracy": 0.8425248611829868,
    "precision": 0.861743140071056,
    "recall": 0.9642385898433535,
    "f1": 0.9101142432879076,
    "true_positives": 57000,
    "true_negatives": 3238,
    "false_positives": 9145,
    "false_negatives": 2114,
    "total": 71497,
    "normalized_precision": 0.87017679523509,
    "normalized_recall": 0.9356542369176571,
    "selection_accuracy": 0.8442280945757997,
    "solvable_selection_accuracy": 0.8797101449275362
  },
  "SkyT1_verdicts": {
    "accuracy": 0.8628274722681142,
    "precision": 0.8806059364005875,
    "recall": 0.9675924188403078,
    "f1": 0.9220521440577947,
    "true_positives": 51563,
    "true_negatives": 3274,
    "false_positives": 6991,
    "false_negatives": 1727,
    "total": 63555,
    "normalized_precision": 0.8728838313095179,
    "normalized_recall": 0.8345839524259955,
    "selection_accuracy": 0.8442280945757997,
    "solvable_selection_accuracy": 0.8797101449275362
  },
  "WizardLM-2-8x22B_verdicts": {
    "accuracy": 0.8361910724703673,
    "precision": 0.8693966676687922,
    "recall": 0.9440235039430958,
    "f1": 0.9051745440767037,
    "true_positives": 54945,
    "true_negatives": 3820,
    "false_positives": 8254,
    "false_negatives": 3258,
    "total": 70277,
    "normalized_precision": 0.8697126395232206,
    "normalized_recall": 0.8965423806999291,
    "selection_accuracy": 0.8414464534075105,
    "solvable_selection_accuracy": 0.8768115942028986
  }
}