{
  "statistical_methods_summary": {
    "confidence_intervals": {
      "wilson_ci": "Used for all FNR confidence intervals (always valid)",
      "delong_ci": "Used for AUROC confidence intervals when score distributions are non-degenerate",
      "bootstrap_ci": "Fallback method for AUROC confidence intervals when DeLong assumptions are violated"
    },
    "hypothesis_tests": {
      "delong_test": "Paired comparisons of AUROC between methods",
      "mcnemar_test": "Paired comparisons of binary classification performance"
    },
    "degeneracy_handling": "Explicit detection and transparent reporting of degenerate score distributions"
  },
  "key_findings": {
    "se_degeneracy": "Semantic entropy exhibits severe score degeneracy across all hypotheses",
    "statistical_validity": "Standard AUROC tests are often inappropriate for SE due to score degeneracy",
    "methodological_transparency": "Degeneracy itself constitutes evidence of SE failure"
  },
  "publication_recommendations": [
    "Report Wilson CIs for all FNR comparisons (always valid)",
    "Document when DeLong AUROC CIs are inappropriate due to degeneracy",
    "Emphasize that score degeneracy strengthens the SE failure argument",
    "Use bootstrap CIs as sensitivity analysis where appropriate"
  ]
}