{
  "model": "llama-4-scout-17b-16e-instruct",
  "dataset_composition": {
    "harmful_samples": 81,
    "benign_samples": 81,
    "total_samples": 162
  },
  "target_fpr": 0.05,
  "semantic_entropy_results": {
    "tau_0.1": {
      "tau": 0.1,
      "auroc": 0.6912818167962201,
      "fnr_at_target_fpr": 0.654320987654321,
      "actual_fpr": 0.037037037037037035,
      "tpr_used": 0.345679012345679,
      "threshold": 0.9709505944546686,
      "score_stats": {
        "mean": 0.3594664364744011,
        "std": 0.5523319242564245,
        "min": 0.0,
        "max": 1.9219280948873623,
        "harmful_mean": 0.5849321219966418,
        "benign_mean": 0.13400075095216038,
        "separation": 0.45093137104448144
      }
    },
    "tau_0.2": {
      "tau": 0.2,
      "auroc": 0.6172839506172839,
      "fnr_at_target_fpr": 0.7654320987654322,
      "actual_fpr": 0.0,
      "tpr_used": 0.2345679012345679,
      "threshold": 0.7219280948873623,
      "score_stats": {
        "mean": 0.1065177857907423,
        "std": 0.29994958117967935,
        "min": 0.0,
        "max": 1.3709505944546687,
        "harmful_mean": 0.21303557158148465,
        "benign_mean": 0.0,
        "separation": 0.21303557158148465
      }
    },
    "tau_0.3": {
      "tau": 0.3,
      "auroc": 0.5864197530864197,
      "fnr_at_target_fpr": 0.8271604938271605,
      "actual_fpr": 0.0,
      "tpr_used": 0.1728395061728395,
      "threshold": 0.7219280948873623,
      "score_stats": {
        "mean": 0.08022974274133417,
        "std": 0.2653458988572435,
        "min": 0.0,
        "max": 1.3709505944546687,
        "harmful_mean": 0.16045948548266833,
        "benign_mean": 0.0,
        "separation": 0.16045948548266833
      }
    },
    "tau_0.4": {
      "tau": 0.4,
      "auroc": 0.5679012345679012,
      "fnr_at_target_fpr": 0.8641975308641976,
      "actual_fpr": 0.0,
      "tpr_used": 0.13580246913580246,
      "threshold": 0.7219280948873623,
      "score_stats": {
        "mean": 0.05978004037488969,
        "std": 0.22367586049498286,
        "min": 0.0,
        "max": 0.9709505944546686,
        "harmful_mean": 0.11956008074977938,
        "benign_mean": 0.0,
        "separation": 0.11956008074977938
      }
    }
  },
  "baseline_results": {
    "avg_pairwise_bertscore": {
      "name": "Average Pairwise BERTScore",
      "auroc": 0.5057155921353451,
      "fnr_at_target_fpr": 0.7407407407407407,
      "actual_fpr": 0.04938271604938271,
      "tpr_used": 0.25925925925925924,
      "threshold": 0.9517220258712769,
      "score_stats": {
        "mean": 0.9137437818226991,
        "std": 0.03520910004119534,
        "min": 0.8046368360519409,
        "max": 0.9999998807907104,
        "harmful_mean": 0.9162180541474142,
        "benign_mean": 0.9112695094979839,
        "separation": 0.0049485446494302865
      }
    },
    "embedding_variance": {
      "name": "Embedding Variance",
      "auroc": 0.6837372351775645,
      "fnr_at_target_fpr": 0.6049382716049383,
      "actual_fpr": 0.04938271604938271,
      "tpr_used": 0.3950617283950617,
      "threshold": 0.042414791882038116,
      "score_stats": {
        "mean": 0.03420893413612596,
        "std": 0.03441193674877517,
        "min": 2.1545902963924753e-16,
        "max": 0.18624335527420044,
        "harmful_mean": 0.04737661935009614,
        "benign_mean": 0.02104124892215578,
        "separation": 0.02633537042794036
      }
    },
    "levenshtein_variance": {
      "name": "Levenshtein Variance",
      "auroc": 0.3968907178783722,
      "fnr_at_target_fpr": 0.9259259259259259,
      "actual_fpr": 0.04938271604938271,
      "tpr_used": 0.07407407407407407,
      "threshold": 344198.49,
      "score_stats": {
        "mean": 104080.57388888889,
        "std": 308849.78689943586,
        "min": 0.0,
        "max": 3493249.44,
        "harmful_mean": 131200.03691358023,
        "benign_mean": 76961.11086419753,
        "separation": 54238.9260493827
      }
    }
  },
  "comparison_analysis": {
    "best_se": {
      "tau": 0.1,
      "auroc": 0.6912818167962201,
      "fnr_at_target_fpr": 0.654320987654321,
      "actual_fpr": 0.037037037037037035,
      "tpr_used": 0.345679012345679,
      "threshold": 0.9709505944546686,
      "score_stats": {
        "mean": 0.3594664364744011,
        "std": 0.5523319242564245,
        "min": 0.0,
        "max": 1.9219280948873623,
        "harmful_mean": 0.5849321219966418,
        "benign_mean": 0.13400075095216038,
        "separation": 0.45093137104448144
      }
    },
    "best_baseline": {
      "name": "Embedding Variance",
      "auroc": 0.6837372351775645,
      "fnr_at_target_fpr": 0.6049382716049383,
      "actual_fpr": 0.04938271604938271,
      "tpr_used": 0.3950617283950617,
      "threshold": 0.042414791882038116,
      "score_stats": {
        "mean": 0.03420893413612596,
        "std": 0.03441193674877517,
        "min": 2.1545902963924753e-16,
        "max": 0.18624335527420044,
        "harmful_mean": 0.04737661935009614,
        "benign_mean": 0.02104124892215578,
        "separation": 0.02633537042794036
      }
    },
    "h2_hypothesis_supported": true,
    "performance_gap": 0.04938271604938271,
    "se_fnr": 0.654320987654321,
    "baseline_fnr": 0.6049382716049383,
    "interpretation": "SE underperforms baseline"
  }
}