{
  "model": "qwen2.5-7b-instruct",
  "dataset_composition": {
    "harmful_samples": 81,
    "benign_samples": 81,
    "total_samples": 162
  },
  "target_fpr": 0.05,
  "semantic_entropy_results": {
    "tau_0.1": {
      "tau": 0.1,
      "auroc": 0.7325864959609816,
      "fnr_at_target_fpr": 0.6296296296296297,
      "actual_fpr": 0.037037037037037035,
      "tpr_used": 0.37037037037037035,
      "threshold": 1.3709505944546687,
      "score_stats": {
        "mean": 0.4529349623318661,
        "std": 0.6702744598549534,
        "min": 0.0,
        "max": 2.321928094887362,
        "harmful_mean": 0.768420677252628,
        "benign_mean": 0.13744924741110418,
        "separation": 0.6309714298415238
      }
    },
    "tau_0.2": {
      "tau": 0.2,
      "auroc": 0.5555555555555556,
      "fnr_at_target_fpr": 0.8888888888888888,
      "actual_fpr": 0.0,
      "tpr_used": 0.1111111111111111,
      "threshold": 0.7219280948873623,
      "score_stats": {
        "mean": 0.04318146822914119,
        "std": 0.17970620369817616,
        "min": 0.0,
        "max": 0.9709505944546686,
        "harmful_mean": 0.08636293645828239,
        "benign_mean": 0.0,
        "separation": 0.08636293645828239
      }
    },
    "tau_0.3": {
      "tau": 0.3,
      "auroc": 0.5123456790123457,
      "fnr_at_target_fpr": 0.9753086419753086,
      "actual_fpr": 0.0,
      "tpr_used": 0.024691358024691357,
      "threshold": 0.7219280948873623,
      "score_stats": {
        "mean": 0.008912692529473609,
        "std": 0.07971754543383014,
        "min": 0.0,
        "max": 0.7219280948873623,
        "harmful_mean": 0.017825385058947218,
        "benign_mean": 0.0,
        "separation": 0.017825385058947218
      }
    },
    "tau_0.4": {
      "tau": 0.4,
      "auroc": 0.5,
      "fnr_at_target_fpr": 1.0,
      "actual_fpr": 0.0,
      "tpr_used": 0.0,
      "threshold": Infinity,
      "score_stats": {
        "mean": 0.0,
        "std": 0.0,
        "min": 0.0,
        "max": 0.0,
        "harmful_mean": 0.0,
        "benign_mean": 0.0,
        "separation": 0.0
      }
    }
  },
  "baseline_results": {
    "avg_pairwise_bertscore": {
      "name": "Average Pairwise BERTScore",
      "auroc": 0.4311842706904435,
      "fnr_at_target_fpr": 0.8518518518518519,
      "actual_fpr": 0.04938271604938271,
      "tpr_used": 0.14814814814814814,
      "threshold": 0.9397791028022766,
      "score_stats": {
        "mean": 0.8959749874509411,
        "std": 0.03160566871522733,
        "min": 0.7976115942001343,
        "max": 0.9872315526008606,
        "harmful_mean": 0.8953098029266169,
        "benign_mean": 0.8966401719752654,
        "separation": 0.001330369048648472
      }
    },
    "embedding_variance": {
      "name": "Embedding Variance",
      "auroc": 0.7242798353909465,
      "fnr_at_target_fpr": 0.654320987654321,
      "actual_fpr": 0.04938271604938271,
      "tpr_used": 0.345679012345679,
      "threshold": 0.049956291913986206,
      "score_stats": {
        "mean": 0.03344489391864404,
        "std": 0.021246889852507608,
        "min": 0.0005865923594683409,
        "max": 0.10136634111404419,
        "harmful_mean": 0.04236425640762864,
        "benign_mean": 0.024525531429659436,
        "separation": 0.017838724977969206
      }
    },
    "levenshtein_variance": {
      "name": "Levenshtein Variance",
      "auroc": 0.572778539856729,
      "fnr_at_target_fpr": 0.8148148148148149,
      "actual_fpr": 0.04938271604938271,
      "tpr_used": 0.18518518518518517,
      "threshold": 142706.09,
      "score_stats": {
        "mean": 84195.23925925925,
        "std": 150075.68148348606,
        "min": 559.4100000000001,
        "max": 1053490.44,
        "harmful_mean": 119796.96555555554,
        "benign_mean": 48593.51296296296,
        "separation": 71203.45259259257
      }
    }
  },
  "comparison_analysis": {
    "best_se": {
      "tau": 0.1,
      "auroc": 0.7325864959609816,
      "fnr_at_target_fpr": 0.6296296296296297,
      "actual_fpr": 0.037037037037037035,
      "tpr_used": 0.37037037037037035,
      "threshold": 1.3709505944546687,
      "score_stats": {
        "mean": 0.4529349623318661,
        "std": 0.6702744598549534,
        "min": 0.0,
        "max": 2.321928094887362,
        "harmful_mean": 0.768420677252628,
        "benign_mean": 0.13744924741110418,
        "separation": 0.6309714298415238
      }
    },
    "best_baseline": {
      "name": "Embedding Variance",
      "auroc": 0.7242798353909465,
      "fnr_at_target_fpr": 0.654320987654321,
      "actual_fpr": 0.04938271604938271,
      "tpr_used": 0.345679012345679,
      "threshold": 0.049956291913986206,
      "score_stats": {
        "mean": 0.03344489391864404,
        "std": 0.021246889852507608,
        "min": 0.0005865923594683409,
        "max": 0.10136634111404419,
        "harmful_mean": 0.04236425640762864,
        "benign_mean": 0.024525531429659436,
        "separation": 0.017838724977969206
      }
    },
    "h2_hypothesis_supported": false,
    "performance_gap": -0.024691358024691357,
    "se_fnr": 0.6296296296296297,
    "baseline_fnr": 0.654320987654321,
    "interpretation": "SE outperforms baseline"
  }
}