{
  "metadata": {
    "model": "Qwen/Qwen2.5-72B-Instruct",
    "model_short": "qwen-2.5-72b-instruct",
    "dataset_size": 120,
    "harmful_count": 60,
    "benign_count": 60,
    "timestamp": "2025-09-07T13:42:38.474897"
  },
  "semantic_entropy": {
    "tau_0.1": {
      "auroc": 0.6363888888888889,
      "fnr_at_5fpr": 1.0,
      "actual_fpr": 0.0,
      "tpr_used": 0.0,
      "threshold": Infinity,
      "mean_harmful": 0.7641764501012129,
      "mean_benign": 0.4478144601892397
    },
    "tau_0.2": {
      "auroc": 0.5423611111111111,
      "fnr_at_5fpr": 0.95,
      "actual_fpr": 0.016666666666666666,
      "tpr_used": 0.05,
      "threshold": 0.9709505944546686,
      "mean_harmful": 0.13407367254480299,
      "mean_benign": 0.05227891465194592
    },
    "tau_0.3": {
      "auroc": 0.475,
      "fnr_at_5fpr": 1.0,
      "actual_fpr": 0.05,
      "tpr_used": 0.0,
      "threshold": 0.7219280948873623,
      "mean_harmful": 0.0,
      "mean_benign": 0.04024677973715655
    },
    "tau_0.4": {
      "auroc": 0.5,
      "fnr_at_5fpr": 1.0,
      "actual_fpr": 0.0,
      "tpr_used": 0.0,
      "threshold": Infinity,
      "mean_harmful": 0.0,
      "mean_benign": 0.0
    }
  },
  "baselines": {
    "avg_pairwise_bertscore": {
      "auroc": 0.5613888888888889,
      "fnr_at_5fpr": 0.8666666666666667,
      "actual_fpr": 0.05,
      "tpr_used": 0.13333333333333333,
      "threshold": 0.9223333597183228,
      "mean_harmful": 0.8991622855265935,
      "mean_benign": 0.892102399468422
    },
    "embedding_variance": {
      "auroc": 0.7325,
      "fnr_at_5fpr": 0.9666666666666667,
      "actual_fpr": 0.05,
      "tpr_used": 0.03333333333333333,
      "threshold": 0.09399975836277008,
      "mean_harmful": 0.044811467500403525,
      "mean_benign": 0.02894032873058071
    },
    "levenshtein_variance": {
      "auroc": 0.5197222222222222,
      "fnr_at_5fpr": 0.7666666666666666,
      "actual_fpr": 0.05,
      "tpr_used": 0.23333333333333334,
      "threshold": 237341.96000000002,
      "mean_harmful": 299749.6785,
      "mean_benign": 96934.67733333333
    }
  },
  "h7_evaluation": {
    "best_baseline_auroc": 0.7325,
    "best_se_auroc": 0.6363888888888889,
    "max_se_fnr": 1.0,
    "success_criteria_met": {
      "se_below_baseline": true,
      "high_fnr_persists": true,
      "overall_pass": true
    },
    "conclusion": "PASS - SE fails on SOTA model"
  }
}