{
  "analysis_metadata": {
    "timestamp": "2025-09-08T10:17:28.466224",
    "hypotheses_processed": [
      "H1",
      "H2",
      "H5",
      "H7"
    ],
    "statistical_methods": [
      "Wilson CI",
      "DeLong test",
      "McNemar test"
    ],
    "degeneracy_handling": "Robust with methodological transparency",
    "statistical_libraries": {
      "scipy_version": "1.16.1",
      "statsmodels_available": true,
      "statsmodels_version": "0.14.5",
      "mlstatkit_available": false,
      "sklearn_version": "1.7.1"
    }
  },
  "hypothesis_results": {
    "H1": {
      "hypothesis": "H1",
      "dataset": "JailbreakBench",
      "models": {
        "llama4scout": {
          "model_name": "llama4scout",
          "metrics": {
            "semantic_entropy": {
              "tau_0.1": {
                "metric_name": "SE_tau_0.1",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.1",
                  "n_samples": 120,
                  "n_unique_scores": 6,
                  "unique_score_ratio": 0.05,
                  "n_zeros": "71",
                  "zero_proportion": 0.5916666666666667,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    1.9219280948873623
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      71,
                      0.5916666666666667
                    ],
                    [
                      0.7219280948873623,
                      22,
                      0.18333333333333332
                    ],
                    [
                      1.9219280948873623,
                      10,
                      0.08333333333333333
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.6822094818819363,
                    "benign_mean": 0.2663525871967489,
                    "harmful_std": 0.6633929142843,
                    "benign_std": 0.5593003131121181
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.05,
                      "zero_proportion": 0.5916666666666667,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": [
                    "MODERATE DEGENERACY: DeLong test assumptions may be violated"
                  ]
                },
                "auroc": 0.685138888888889,
                "delong_ci": [
                  0.5992380719204431,
                  0.7710397058573348
                ],
                "delong_ci_valid": true,
                "delong_method": "MLstatkit",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              },
              "tau_0.2": {
                "metric_name": "SE_tau_0.2",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 120,
                  "n_unique_scores": 5,
                  "unique_score_ratio": 0.041666666666666664,
                  "n_zeros": "92",
                  "zero_proportion": 0.7666666666666667,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    1.9219280948873623
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      92,
                      0.7666666666666667
                    ],
                    [
                      0.7219280948873623,
                      19,
                      0.15833333333333333
                    ],
                    [
                      0.9709505944546686,
                      7,
                      0.058333333333333334
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.3486409045639191,
                    "benign_mean": 0.04812853965915749,
                    "harmful_std": 0.46002178073729433,
                    "benign_std": 0.18008050593032918
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.041666666666666664,
                      "zero_proportion": 0.7666666666666667,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 5/120 unique scores"
                  ]
                },
                "auroc": 0.6716666666666666,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution",
                "fnr": 0.85,
                "fnr_wilson_ci": [
                  0.7388541093022145,
                  0.9190255941983225
                ],
                "fnr_formatted": "0.850 [0.739, 0.919]"
              },
              "tau_0.3": {
                "metric_name": "SE_tau_0.3",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.3",
                  "n_samples": 120,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.025,
                  "n_zeros": "103",
                  "zero_proportion": 0.8583333333333333,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      103,
                      0.8583333333333333
                    ],
                    [
                      0.7219280948873623,
                      14,
                      0.11666666666666667
                    ],
                    [
                      0.9709505944546686,
                      3,
                      0.025
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.20496528361499525,
                    "benign_mean": 0.012032134914789373,
                    "harmful_std": 0.3435823866705323,
                    "benign_std": 0.09242058193851582
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.025,
                      "zero_proportion": 0.8583333333333333,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/120 unique scores",
                    "HIGH ZERO CONCENTRATION: 85.8% of scores are exactly zero"
                  ]
                },
                "auroc": 0.6254166666666666,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution",
                "fnr": 0.7333333333333333,
                "fnr_wilson_ci": [
                  0.6099128690331636,
                  0.8286735999671945
                ],
                "fnr_formatted": "0.733 [0.610, 0.829]"
              },
              "tau_0.4": {
                "metric_name": "SE_tau_0.4",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.4",
                  "n_samples": 120,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.025,
                  "n_zeros": "110",
                  "zero_proportion": 0.9166666666666666,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      110,
                      0.9166666666666666
                    ],
                    [
                      0.7219280948873623,
                      8,
                      0.06666666666666667
                    ],
                    [
                      0.9709505944546686,
                      2,
                      0.016666666666666666
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.12862209913347059,
                    "benign_mean": 0.0,
                    "harmful_std": 0.29046838164463784,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.025,
                      "zero_proportion": 0.9166666666666666,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/120 unique scores",
                    "HIGH ZERO CONCENTRATION: 91.7% of scores are exactly zero"
                  ]
                },
                "auroc": 0.5833333333333334,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution",
                "fnr": 0.8333333333333334,
                "fnr_wilson_ci": [
                  0.7196838683638547,
                  0.9068682302080855
                ],
                "fnr_formatted": "0.833 [0.720, 0.907]"
              }
            },
            "BERTScore": {
              "metric_name": "BERTScore",
              "distribution_analysis": {
                "metric_name": "BERTScore",
                "n_samples": 120,
                "n_unique_scores": 119,
                "unique_score_ratio": 0.9916666666666667,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.8550626635551453,
                  1.0
                ],
                "most_common_values": [
                  [
                    1.0,
                    2,
                    0.016666666666666666
                  ],
                  [
                    0.8774212598800659,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.8977276682853699,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.9341231316328049,
                  "benign_mean": 0.9019054690996806,
                  "harmful_std": 0.03682057098558438,
                  "benign_std": 0.023225074264305895
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 0.9916666666666667,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.7672222222222222,
              "delong_ci": [
                0.6799285830454298,
                0.8545158613990147
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true,
              "fnr": 0.6,
              "fnr_wilson_ci": [
                0.4736605349204075,
                0.7143050946511744
              ],
              "fnr_formatted": "0.600 [0.474, 0.714]"
            },
            "EmbeddingVariance": {
              "metric_name": "EmbeddingVariance",
              "distribution_analysis": {
                "metric_name": "EmbeddingVariance",
                "n_samples": 120,
                "n_unique_scores": 120,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  1.6051951976826795e-16,
                  0.13768619298934937
                ],
                "most_common_values": [
                  [
                    0.039791539311409,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.006595076061785221,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.030444130301475525,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.050028230525398044,
                  "benign_mean": 0.025817701127380133,
                  "harmful_std": 0.03899043709875208,
                  "benign_std": 0.01946588307367237
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.6536111111111111,
              "delong_ci": [
                0.5492777531588632,
                0.7579444690633591
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true,
              "fnr": 0.6666666666666666,
              "fnr_wilson_ci": [
                0.5405686645211968,
                0.7727073847647731
              ],
              "fnr_formatted": "0.667 [0.541, 0.773]"
            },
            "LevenshteinVariance": {
              "metric_name": "LevenshteinVariance",
              "distribution_analysis": {
                "metric_name": "LevenshteinVariance",
                "n_samples": 120,
                "n_unique_scores": 118,
                "unique_score_ratio": 0.9833333333333333,
                "n_zeros": "3",
                "zero_proportion": 0.025,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.0,
                  3928112.16
                ],
                "most_common_values": [
                  [
                    0.0,
                    3,
                    0.025
                  ],
                  [
                    76501.41,
                    1,
                    0.008333333333333333
                  ],
                  [
                    23402.25,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 72934.05733333335,
                  "benign_mean": 118133.1995,
                  "harmful_std": 175353.4648079683,
                  "benign_std": 503168.0269873976
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 0.9833333333333333,
                    "zero_proportion": 0.025,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.2891666666666666,
              "delong_ci": [
                0.1905776759733993,
                0.38775565735993406
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true,
              "fnr": 0.8833333333333333,
              "fnr_wilson_ci": [
                0.7782121197582914,
                0.9423227935994396
              ],
              "fnr_formatted": "0.883 [0.778, 0.942]"
            }
          },
          "paired_comparisons": {
            "SE_vs_BERTScore": {
              "metric1_name": "SE_tau_0.3",
              "metric2_name": "BERTScore",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.3",
                  "n_samples": 120,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.025,
                  "n_zeros": "103",
                  "zero_proportion": 0.8583333333333333,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      103,
                      0.8583333333333333
                    ],
                    [
                      0.7219280948873623,
                      14,
                      0.11666666666666667
                    ],
                    [
                      0.9709505944546686,
                      3,
                      0.025
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.20496528361499525,
                    "benign_mean": 0.012032134914789373,
                    "harmful_std": 0.3435823866705323,
                    "benign_std": 0.09242058193851582
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.025,
                      "zero_proportion": 0.8583333333333333,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/120 unique scores",
                    "HIGH ZERO CONCENTRATION: 85.8% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "BERTScore",
                  "n_samples": 120,
                  "n_unique_scores": 119,
                  "unique_score_ratio": 0.9916666666666667,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.8550626635551453,
                    1.0
                  ],
                  "most_common_values": [
                    [
                      1.0,
                      2,
                      0.016666666666666666
                    ],
                    [
                      0.8774212598800659,
                      1,
                      0.008333333333333333
                    ],
                    [
                      0.8977276682853699,
                      1,
                      0.008333333333333333
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.9341231316328049,
                    "benign_mean": 0.9019054690996806,
                    "harmful_std": 0.03682057098558438,
                    "benign_std": 0.023225074264305895
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 0.9916666666666667,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.6254166666666666,
              "auroc2": 0.7672222222222222,
              "auroc_difference": 0.14180555555555563,
              "delong_test": {
                "valid": false,
                "reasons": [
                  "SE_tau_0.3 distribution too degenerate"
                ]
              }
            },
            "SE_vs_EmbeddingVariance": {
              "metric1_name": "SE_tau_0.3",
              "metric2_name": "EmbeddingVariance",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.3",
                  "n_samples": 120,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.025,
                  "n_zeros": "103",
                  "zero_proportion": 0.8583333333333333,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      103,
                      0.8583333333333333
                    ],
                    [
                      0.7219280948873623,
                      14,
                      0.11666666666666667
                    ],
                    [
                      0.9709505944546686,
                      3,
                      0.025
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.20496528361499525,
                    "benign_mean": 0.012032134914789373,
                    "harmful_std": 0.3435823866705323,
                    "benign_std": 0.09242058193851582
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.025,
                      "zero_proportion": 0.8583333333333333,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/120 unique scores",
                    "HIGH ZERO CONCENTRATION: 85.8% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "EmbeddingVariance",
                  "n_samples": 120,
                  "n_unique_scores": 120,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    1.6051951976826795e-16,
                    0.13768619298934937
                  ],
                  "most_common_values": [
                    [
                      0.039791539311409,
                      1,
                      0.008333333333333333
                    ],
                    [
                      0.006595076061785221,
                      1,
                      0.008333333333333333
                    ],
                    [
                      0.030444130301475525,
                      1,
                      0.008333333333333333
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.050028230525398044,
                    "benign_mean": 0.025817701127380133,
                    "harmful_std": 0.03899043709875208,
                    "benign_std": 0.01946588307367237
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.6254166666666666,
              "auroc2": 0.6536111111111111,
              "auroc_difference": 0.028194444444444522,
              "delong_test": {
                "valid": false,
                "reasons": [
                  "SE_tau_0.3 distribution too degenerate"
                ]
              }
            },
            "SE_vs_LevenshteinVariance": {
              "metric1_name": "SE_tau_0.3",
              "metric2_name": "LevenshteinVariance",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.3",
                  "n_samples": 120,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.025,
                  "n_zeros": "103",
                  "zero_proportion": 0.8583333333333333,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      103,
                      0.8583333333333333
                    ],
                    [
                      0.7219280948873623,
                      14,
                      0.11666666666666667
                    ],
                    [
                      0.9709505944546686,
                      3,
                      0.025
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.20496528361499525,
                    "benign_mean": 0.012032134914789373,
                    "harmful_std": 0.3435823866705323,
                    "benign_std": 0.09242058193851582
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.025,
                      "zero_proportion": 0.8583333333333333,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/120 unique scores",
                    "HIGH ZERO CONCENTRATION: 85.8% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "LevenshteinVariance",
                  "n_samples": 120,
                  "n_unique_scores": 118,
                  "unique_score_ratio": 0.9833333333333333,
                  "n_zeros": "3",
                  "zero_proportion": 0.025,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    3928112.16
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      3,
                      0.025
                    ],
                    [
                      76501.41,
                      1,
                      0.008333333333333333
                    ],
                    [
                      23402.25,
                      1,
                      0.008333333333333333
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 72934.05733333335,
                    "benign_mean": 118133.1995,
                    "harmful_std": 175353.4648079683,
                    "benign_std": 503168.0269873976
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 0.9833333333333333,
                      "zero_proportion": 0.025,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.6254166666666666,
              "auroc2": 0.2891666666666666,
              "auroc_difference": -0.33625,
              "delong_test": {
                "valid": false,
                "reasons": [
                  "SE_tau_0.3 distribution too degenerate"
                ]
              }
            }
          }
        },
        "qwen25": {
          "model_name": "qwen25",
          "metrics": {
            "semantic_entropy": {
              "tau_0.1": {
                "metric_name": "SE_tau_0.1",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.1",
                  "n_samples": 120,
                  "n_unique_scores": 7,
                  "unique_score_ratio": 0.058333333333333334,
                  "n_zeros": "50",
                  "zero_proportion": 0.4166666666666667,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    2.321928094887362
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      50,
                      0.4166666666666667
                    ],
                    [
                      0.7219280948873623,
                      23,
                      0.19166666666666668
                    ],
                    [
                      1.9219280948873623,
                      12,
                      0.1
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.9889810591422613,
                    "benign_mean": 0.5387921764226417,
                    "harmful_std": 0.6536895774398096,
                    "benign_std": 0.8113798511037924
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.058333333333333334,
                      "zero_proportion": 0.4166666666666667,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": [
                    "MODERATE DEGENERACY: DeLong test assumptions may be violated"
                  ]
                },
                "auroc": 0.6901388888888889,
                "delong_ci": [
                  0.5937596963175698,
                  0.786518081460208
                ],
                "delong_ci_valid": true,
                "delong_method": "MLstatkit",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              },
              "tau_0.2": {
                "metric_name": "SE_tau_0.2",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 120,
                  "n_unique_scores": 7,
                  "unique_score_ratio": 0.058333333333333334,
                  "n_zeros": "102",
                  "zero_proportion": 0.85,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    2.321928094887362
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      102,
                      0.85
                    ],
                    [
                      0.7219280948873623,
                      11,
                      0.09166666666666666
                    ],
                    [
                      0.9709505944546686,
                      3,
                      0.025
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.1514712757077151,
                    "benign_mean": 0.14837531939631404,
                    "harmful_std": 0.33060107374824543,
                    "benign_std": 0.4580173624953092
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.058333333333333334,
                      "zero_proportion": 0.85,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": [
                    "MODERATE DEGENERACY: DeLong test assumptions may be violated",
                    "HIGH ZERO CONCENTRATION: 85.0% of scores are exactly zero"
                  ]
                },
                "auroc": 0.5290277777777778,
                "delong_ci": [
                  0.4644240999669153,
                  0.5936314555886405
                ],
                "delong_ci_valid": true,
                "delong_method": "MLstatkit",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution",
                "fnr": 0.9833333333333333,
                "fnr_wilson_ci": [
                  0.9114487027240993,
                  0.9970518402052136
                ],
                "fnr_formatted": "0.983 [0.911, 0.997]"
              },
              "tau_0.3": {
                "metric_name": "SE_tau_0.3",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.3",
                  "n_samples": 120,
                  "n_unique_scores": 2,
                  "unique_score_ratio": 0.016666666666666666,
                  "n_zeros": "116",
                  "zero_proportion": 0.9666666666666667,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      116,
                      0.9666666666666667
                    ],
                    [
                      0.9709505944546686,
                      4,
                      0.03333333333333333
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.01618250990757781,
                    "benign_mean": 0.04854752972273343,
                    "harmful_std": 0.12430021716643293,
                    "benign_std": 0.2116137760199322
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.016666666666666666,
                      "zero_proportion": 0.9666666666666667,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 2/120 unique scores",
                    "HIGH ZERO CONCENTRATION: 96.7% of scores are exactly zero"
                  ]
                },
                "auroc": 0.4833333333333333,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution",
                "fnr": 0.9833333333333333,
                "fnr_wilson_ci": [
                  0.9114487027240993,
                  0.9970518402052136
                ],
                "fnr_formatted": "0.983 [0.911, 0.997]"
              },
              "tau_0.4": {
                "metric_name": "SE_tau_0.4",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.4",
                  "n_samples": 120,
                  "n_unique_scores": 1,
                  "unique_score_ratio": 0.008333333333333333,
                  "n_zeros": "120",
                  "zero_proportion": 1.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.0
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      120,
                      1.0
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.0,
                    "benign_mean": 0.0,
                    "harmful_std": 0.0,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.008333333333333333,
                      "zero_proportion": 1.0,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 1/120 unique scores",
                    "HIGH ZERO CONCENTRATION: 100.0% of scores are exactly zero"
                  ]
                },
                "auroc": 0.5,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              }
            },
            "BERTScore": {
              "metric_name": "BERTScore",
              "distribution_analysis": {
                "metric_name": "BERTScore",
                "n_samples": 120,
                "n_unique_scores": 120,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.8301523327827454,
                  0.9510933756828308
                ],
                "most_common_values": [
                  [
                    0.9252229928970337,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.9051238894462585,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.8570109605789185,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.8914870947599411,
                  "benign_mean": 0.8831479062636693,
                  "harmful_std": 0.020067755724831906,
                  "benign_std": 0.019615868369943636
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.615,
              "delong_ci": [
                0.5139625699477964,
                0.716037430052204
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true,
              "fnr": 0.8666666666666667,
              "fnr_wilson_ci": [
                0.7583484032350039,
                0.9308589051941303
              ],
              "fnr_formatted": "0.867 [0.758, 0.931]"
            },
            "EmbeddingVariance": {
              "metric_name": "EmbeddingVariance",
              "distribution_analysis": {
                "metric_name": "EmbeddingVariance",
                "n_samples": 120,
                "n_unique_scores": 120,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.00738135352730751,
                  0.1524171233177185
                ],
                "most_common_values": [
                  [
                    0.033994417637586594,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.009308574721217155,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.040656767785549164,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.05071117606324454,
                  "benign_mean": 0.037843278034900625,
                  "harmful_std": 0.021019760002761234,
                  "benign_std": 0.03125346554359996
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.7205555555555556,
              "delong_ci": [
                0.6247134953110518,
                0.8163976158000594
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true,
              "fnr": 0.9666666666666667,
              "fnr_wilson_ci": [
                0.886362257256914,
                0.9908106807438021
              ],
              "fnr_formatted": "0.967 [0.886, 0.991]"
            },
            "LevenshteinVariance": {
              "metric_name": "LevenshteinVariance",
              "distribution_analysis": {
                "metric_name": "LevenshteinVariance",
                "n_samples": 120,
                "n_unique_scores": 120,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  662.21,
                  1897289.44
                ],
                "most_common_values": [
                  [
                    4910.01,
                    1,
                    0.008333333333333333
                  ],
                  [
                    11272.89,
                    1,
                    0.008333333333333333
                  ],
                  [
                    51571.89,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 155062.19700000004,
                  "benign_mean": 92086.1705,
                  "harmful_std": 238264.1354793874,
                  "benign_std": 255303.78281650477
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.6013888888888889,
              "delong_ci": [
                0.49848269967521974,
                0.7042950781025578
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true,
              "fnr": 0.7666666666666667,
              "fnr_wilson_ci": [
                0.6456372962239805,
                0.8556043826335717
              ],
              "fnr_formatted": "0.767 [0.646, 0.856]"
            }
          },
          "paired_comparisons": {
            "SE_vs_BERTScore": {
              "metric1_name": "SE_tau_0.2",
              "metric2_name": "BERTScore",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 120,
                  "n_unique_scores": 7,
                  "unique_score_ratio": 0.058333333333333334,
                  "n_zeros": "102",
                  "zero_proportion": 0.85,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    2.321928094887362
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      102,
                      0.85
                    ],
                    [
                      0.7219280948873623,
                      11,
                      0.09166666666666666
                    ],
                    [
                      0.9709505944546686,
                      3,
                      0.025
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.1514712757077151,
                    "benign_mean": 0.14837531939631404,
                    "harmful_std": 0.33060107374824543,
                    "benign_std": 0.4580173624953092
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.058333333333333334,
                      "zero_proportion": 0.85,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": [
                    "MODERATE DEGENERACY: DeLong test assumptions may be violated",
                    "HIGH ZERO CONCENTRATION: 85.0% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "BERTScore",
                  "n_samples": 120,
                  "n_unique_scores": 120,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.8301523327827454,
                    0.9510933756828308
                  ],
                  "most_common_values": [
                    [
                      0.9252229928970337,
                      1,
                      0.008333333333333333
                    ],
                    [
                      0.9051238894462585,
                      1,
                      0.008333333333333333
                    ],
                    [
                      0.8570109605789185,
                      1,
                      0.008333333333333333
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.8914870947599411,
                    "benign_mean": 0.8831479062636693,
                    "harmful_std": 0.020067755724831906,
                    "benign_std": 0.019615868369943636
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.5290277777777778,
              "auroc2": 0.615,
              "auroc_difference": 0.08597222222222223,
              "delong_test": {
                "valid": false,
                "error": "name 'delong_result' is not defined"
              }
            },
            "SE_vs_EmbeddingVariance": {
              "metric1_name": "SE_tau_0.2",
              "metric2_name": "EmbeddingVariance",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 120,
                  "n_unique_scores": 7,
                  "unique_score_ratio": 0.058333333333333334,
                  "n_zeros": "102",
                  "zero_proportion": 0.85,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    2.321928094887362
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      102,
                      0.85
                    ],
                    [
                      0.7219280948873623,
                      11,
                      0.09166666666666666
                    ],
                    [
                      0.9709505944546686,
                      3,
                      0.025
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.1514712757077151,
                    "benign_mean": 0.14837531939631404,
                    "harmful_std": 0.33060107374824543,
                    "benign_std": 0.4580173624953092
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.058333333333333334,
                      "zero_proportion": 0.85,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": [
                    "MODERATE DEGENERACY: DeLong test assumptions may be violated",
                    "HIGH ZERO CONCENTRATION: 85.0% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "EmbeddingVariance",
                  "n_samples": 120,
                  "n_unique_scores": 120,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.00738135352730751,
                    0.1524171233177185
                  ],
                  "most_common_values": [
                    [
                      0.033994417637586594,
                      1,
                      0.008333333333333333
                    ],
                    [
                      0.009308574721217155,
                      1,
                      0.008333333333333333
                    ],
                    [
                      0.040656767785549164,
                      1,
                      0.008333333333333333
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.05071117606324454,
                    "benign_mean": 0.037843278034900625,
                    "harmful_std": 0.021019760002761234,
                    "benign_std": 0.03125346554359996
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.5290277777777778,
              "auroc2": 0.7205555555555556,
              "auroc_difference": 0.19152777777777785,
              "delong_test": {
                "valid": false,
                "error": "name 'delong_result' is not defined"
              }
            },
            "SE_vs_LevenshteinVariance": {
              "metric1_name": "SE_tau_0.2",
              "metric2_name": "LevenshteinVariance",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 120,
                  "n_unique_scores": 7,
                  "unique_score_ratio": 0.058333333333333334,
                  "n_zeros": "102",
                  "zero_proportion": 0.85,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    2.321928094887362
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      102,
                      0.85
                    ],
                    [
                      0.7219280948873623,
                      11,
                      0.09166666666666666
                    ],
                    [
                      0.9709505944546686,
                      3,
                      0.025
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.1514712757077151,
                    "benign_mean": 0.14837531939631404,
                    "harmful_std": 0.33060107374824543,
                    "benign_std": 0.4580173624953092
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.058333333333333334,
                      "zero_proportion": 0.85,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": [
                    "MODERATE DEGENERACY: DeLong test assumptions may be violated",
                    "HIGH ZERO CONCENTRATION: 85.0% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "LevenshteinVariance",
                  "n_samples": 120,
                  "n_unique_scores": 120,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    662.21,
                    1897289.44
                  ],
                  "most_common_values": [
                    [
                      4910.01,
                      1,
                      0.008333333333333333
                    ],
                    [
                      11272.89,
                      1,
                      0.008333333333333333
                    ],
                    [
                      51571.89,
                      1,
                      0.008333333333333333
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 155062.19700000004,
                    "benign_mean": 92086.1705,
                    "harmful_std": 238264.1354793874,
                    "benign_std": 255303.78281650477
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.5290277777777778,
              "auroc2": 0.6013888888888889,
              "auroc_difference": 0.0723611111111111,
              "delong_test": {
                "valid": false,
                "error": "name 'delong_result' is not defined"
              }
            }
          }
        }
      }
    },
    "H2": {
      "hypothesis": "H2",
      "dataset": "HarmBench",
      "models": {
        "llama-4-scout-17b-16e-instruct": {
          "model_name": "llama-4-scout-17b-16e-instruct",
          "metrics": {
            "semantic_entropy": {
              "tau_0.1": {
                "metric_name": "SE_tau_0.1",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.1",
                  "n_samples": 162,
                  "n_unique_scores": 6,
                  "unique_score_ratio": 0.037037037037037035,
                  "n_zeros": "107",
                  "zero_proportion": 0.6604938271604939,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    1.9219280948873623
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      107,
                      0.6604938271604939
                    ],
                    [
                      0.7219280948873623,
                      24,
                      0.14814814814814814
                    ],
                    [
                      0.9709505944546686,
                      13,
                      0.08024691358024691
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.5849321219966418,
                    "benign_mean": 0.13400075095216038,
                    "harmful_std": 0.6486399012997586,
                    "benign_std": 0.2962057339948386
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.037037037037037035,
                      "zero_proportion": 0.6604938271604939,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 6/162 unique scores"
                  ]
                },
                "auroc": 0.6912818167962201,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              },
              "tau_0.2": {
                "metric_name": "SE_tau_0.2",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 162,
                  "n_unique_scores": 4,
                  "unique_score_ratio": 0.024691358024691357,
                  "n_zeros": "143",
                  "zero_proportion": 0.8827160493827161,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    1.3709505944546687
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      143,
                      0.8827160493827161
                    ],
                    [
                      0.9709505944546686,
                      9,
                      0.05555555555555555
                    ],
                    [
                      0.7219280948873623,
                      8,
                      0.04938271604938271
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.21303557158148465,
                    "benign_mean": 0.0,
                    "harmful_std": 0.396544354543354,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.024691358024691357,
                      "zero_proportion": 0.8827160493827161,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 4/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 88.3% of scores are exactly zero"
                  ]
                },
                "auroc": 0.6172839506172839,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              },
              "tau_0.3": {
                "metric_name": "SE_tau_0.3",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.3",
                  "n_samples": 162,
                  "n_unique_scores": 4,
                  "unique_score_ratio": 0.024691358024691357,
                  "n_zeros": "148",
                  "zero_proportion": 0.9135802469135802,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    1.3709505944546687
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      148,
                      0.9135802469135802
                    ],
                    [
                      0.9709505944546686,
                      9,
                      0.05555555555555555
                    ],
                    [
                      0.7219280948873623,
                      4,
                      0.024691358024691357
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.16045948548266833,
                    "benign_mean": 0.0,
                    "harmful_std": 0.3576915834067607,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.024691358024691357,
                      "zero_proportion": 0.9135802469135802,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 4/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 91.4% of scores are exactly zero"
                  ]
                },
                "auroc": 0.5864197530864197,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              },
              "tau_0.4": {
                "metric_name": "SE_tau_0.4",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.4",
                  "n_samples": 162,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.018518518518518517,
                  "n_zeros": "151",
                  "zero_proportion": 0.9320987654320988,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      151,
                      0.9320987654320988
                    ],
                    [
                      0.9709505944546686,
                      7,
                      0.043209876543209874
                    ],
                    [
                      0.7219280948873623,
                      4,
                      0.024691358024691357
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.11956008074977938,
                    "benign_mean": 0.0,
                    "harmful_std": 0.30481875710312706,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.018518518518518517,
                      "zero_proportion": 0.9320987654320988,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 93.2% of scores are exactly zero"
                  ]
                },
                "auroc": 0.5679012345679012,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              }
            },
            "BERTScore": {
              "metric_name": "BERTScore",
              "distribution_analysis": {
                "metric_name": "BERTScore",
                "n_samples": 162,
                "n_unique_scores": 162,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.8046368360519409,
                  0.9999998807907104
                ],
                "most_common_values": [
                  [
                    0.9045571088790894,
                    1,
                    0.006172839506172839
                  ],
                  [
                    0.9440921545028687,
                    1,
                    0.006172839506172839
                  ],
                  [
                    0.9056106805801392,
                    1,
                    0.006172839506172839
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.9162180541474142,
                  "benign_mean": 0.9112695094979839,
                  "harmful_std": 0.04465958143496275,
                  "benign_std": 0.021740266566949707
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.5057155921353451,
              "delong_ci": [
                0.41154585031946583,
                0.5998853339512245
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "EmbeddingVariance": {
              "metric_name": "EmbeddingVariance",
              "distribution_analysis": {
                "metric_name": "EmbeddingVariance",
                "n_samples": 162,
                "n_unique_scores": 162,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  2.1545902963924753e-16,
                  0.18624335527420044
                ],
                "most_common_values": [
                  [
                    0.04219241812825203,
                    1,
                    0.006172839506172839
                  ],
                  [
                    0.0086062615737319,
                    1,
                    0.006172839506172839
                  ],
                  [
                    0.009998868219554424,
                    1,
                    0.006172839506172839
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.04737661935009614,
                  "benign_mean": 0.02104124892215578,
                  "harmful_std": 0.043628091709274765,
                  "benign_std": 0.010870902797012704
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.6837372351775645,
              "delong_ci": [
                0.5985452051316781,
                0.7689292652234506
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "LevenshteinVariance": {
              "metric_name": "LevenshteinVariance",
              "distribution_analysis": {
                "metric_name": "LevenshteinVariance",
                "n_samples": 162,
                "n_unique_scores": 162,
                "unique_score_ratio": 1.0,
                "n_zeros": "1",
                "zero_proportion": 0.006172839506172839,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.0,
                  3493249.44
                ],
                "most_common_values": [
                  [
                    8481.490000000002,
                    1,
                    0.006172839506172839
                  ],
                  [
                    127144.49000000002,
                    1,
                    0.006172839506172839
                  ],
                  [
                    210873.03999999998,
                    1,
                    0.006172839506172839
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 131200.03691358023,
                  "benign_mean": 76961.11086419753,
                  "harmful_std": 422506.7934899724,
                  "benign_std": 103891.5811843312
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.006172839506172839,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.3968907178783722,
              "delong_ci": [
                0.3072126391170881,
                0.4865687966396564
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            }
          },
          "paired_comparisons": {
            "SE_vs_BERTScore": {
              "metric1_name": "SE_tau_0.2",
              "metric2_name": "BERTScore",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 162,
                  "n_unique_scores": 4,
                  "unique_score_ratio": 0.024691358024691357,
                  "n_zeros": "143",
                  "zero_proportion": 0.8827160493827161,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    1.3709505944546687
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      143,
                      0.8827160493827161
                    ],
                    [
                      0.9709505944546686,
                      9,
                      0.05555555555555555
                    ],
                    [
                      0.7219280948873623,
                      8,
                      0.04938271604938271
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.21303557158148465,
                    "benign_mean": 0.0,
                    "harmful_std": 0.396544354543354,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.024691358024691357,
                      "zero_proportion": 0.8827160493827161,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 4/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 88.3% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "BERTScore",
                  "n_samples": 162,
                  "n_unique_scores": 162,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.8046368360519409,
                    0.9999998807907104
                  ],
                  "most_common_values": [
                    [
                      0.9045571088790894,
                      1,
                      0.006172839506172839
                    ],
                    [
                      0.9440921545028687,
                      1,
                      0.006172839506172839
                    ],
                    [
                      0.9056106805801392,
                      1,
                      0.006172839506172839
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.9162180541474142,
                    "benign_mean": 0.9112695094979839,
                    "harmful_std": 0.04465958143496275,
                    "benign_std": 0.021740266566949707
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.6172839506172839,
              "auroc2": 0.5057155921353451,
              "auroc_difference": -0.1115683584819388,
              "delong_test": {
                "valid": false,
                "reasons": [
                  "SE_tau_0.2 distribution too degenerate"
                ]
              }
            },
            "SE_vs_EmbeddingVariance": {
              "metric1_name": "SE_tau_0.2",
              "metric2_name": "EmbeddingVariance",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 162,
                  "n_unique_scores": 4,
                  "unique_score_ratio": 0.024691358024691357,
                  "n_zeros": "143",
                  "zero_proportion": 0.8827160493827161,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    1.3709505944546687
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      143,
                      0.8827160493827161
                    ],
                    [
                      0.9709505944546686,
                      9,
                      0.05555555555555555
                    ],
                    [
                      0.7219280948873623,
                      8,
                      0.04938271604938271
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.21303557158148465,
                    "benign_mean": 0.0,
                    "harmful_std": 0.396544354543354,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.024691358024691357,
                      "zero_proportion": 0.8827160493827161,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 4/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 88.3% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "EmbeddingVariance",
                  "n_samples": 162,
                  "n_unique_scores": 162,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    2.1545902963924753e-16,
                    0.18624335527420044
                  ],
                  "most_common_values": [
                    [
                      0.04219241812825203,
                      1,
                      0.006172839506172839
                    ],
                    [
                      0.0086062615737319,
                      1,
                      0.006172839506172839
                    ],
                    [
                      0.009998868219554424,
                      1,
                      0.006172839506172839
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.04737661935009614,
                    "benign_mean": 0.02104124892215578,
                    "harmful_std": 0.043628091709274765,
                    "benign_std": 0.010870902797012704
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.6172839506172839,
              "auroc2": 0.6837372351775645,
              "auroc_difference": 0.06645328456028055,
              "delong_test": {
                "valid": false,
                "reasons": [
                  "SE_tau_0.2 distribution too degenerate"
                ]
              }
            },
            "SE_vs_LevenshteinVariance": {
              "metric1_name": "SE_tau_0.2",
              "metric2_name": "LevenshteinVariance",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 162,
                  "n_unique_scores": 4,
                  "unique_score_ratio": 0.024691358024691357,
                  "n_zeros": "143",
                  "zero_proportion": 0.8827160493827161,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    1.3709505944546687
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      143,
                      0.8827160493827161
                    ],
                    [
                      0.9709505944546686,
                      9,
                      0.05555555555555555
                    ],
                    [
                      0.7219280948873623,
                      8,
                      0.04938271604938271
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.21303557158148465,
                    "benign_mean": 0.0,
                    "harmful_std": 0.396544354543354,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.024691358024691357,
                      "zero_proportion": 0.8827160493827161,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 4/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 88.3% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "LevenshteinVariance",
                  "n_samples": 162,
                  "n_unique_scores": 162,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "1",
                  "zero_proportion": 0.006172839506172839,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    3493249.44
                  ],
                  "most_common_values": [
                    [
                      8481.490000000002,
                      1,
                      0.006172839506172839
                    ],
                    [
                      127144.49000000002,
                      1,
                      0.006172839506172839
                    ],
                    [
                      210873.03999999998,
                      1,
                      0.006172839506172839
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 131200.03691358023,
                    "benign_mean": 76961.11086419753,
                    "harmful_std": 422506.7934899724,
                    "benign_std": 103891.5811843312
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.006172839506172839,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.6172839506172839,
              "auroc2": 0.3968907178783722,
              "auroc_difference": -0.2203932327389117,
              "delong_test": {
                "valid": false,
                "reasons": [
                  "SE_tau_0.2 distribution too degenerate"
                ]
              }
            }
          }
        },
        "qwen2.5-7b-instruct": {
          "model_name": "qwen2.5-7b-instruct",
          "metrics": {
            "semantic_entropy": {
              "tau_0.1": {
                "metric_name": "SE_tau_0.1",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.1",
                  "n_samples": 162,
                  "n_unique_scores": 7,
                  "unique_score_ratio": 0.043209876543209874,
                  "n_zeros": "105",
                  "zero_proportion": 0.6481481481481481,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    2.321928094887362
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      105,
                      0.6481481481481481
                    ],
                    [
                      1.3709505944546687,
                      14,
                      0.08641975308641975
                    ],
                    [
                      0.7219280948873623,
                      13,
                      0.08024691358024691
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.768420677252628,
                    "benign_mean": 0.13744924741110418,
                    "harmful_std": 0.7558155206794037,
                    "benign_std": 0.35807279864532093
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.043209876543209874,
                      "zero_proportion": 0.6481481481481481,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 7/162 unique scores"
                  ]
                },
                "auroc": 0.7325864959609816,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              },
              "tau_0.2": {
                "metric_name": "SE_tau_0.2",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 162,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.018518518518518517,
                  "n_zeros": "153",
                  "zero_proportion": 0.9444444444444444,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      153,
                      0.9444444444444444
                    ],
                    [
                      0.7219280948873623,
                      7,
                      0.043209876543209874
                    ],
                    [
                      0.9709505944546686,
                      2,
                      0.012345679012345678
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.08636293645828239,
                    "benign_mean": 0.0,
                    "harmful_std": 0.24669690086900586,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.018518518518518517,
                      "zero_proportion": 0.9444444444444444,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 94.4% of scores are exactly zero"
                  ]
                },
                "auroc": 0.5555555555555556,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              },
              "tau_0.3": {
                "metric_name": "SE_tau_0.3",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.3",
                  "n_samples": 162,
                  "n_unique_scores": 2,
                  "unique_score_ratio": 0.012345679012345678,
                  "n_zeros": "160",
                  "zero_proportion": 0.9876543209876543,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.7219280948873623
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      160,
                      0.9876543209876543
                    ],
                    [
                      0.7219280948873623,
                      2,
                      0.012345679012345678
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.017825385058947218,
                    "benign_mean": 0.0,
                    "harmful_std": 0.11203080792237322,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.012345679012345678,
                      "zero_proportion": 0.9876543209876543,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 2/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 98.8% of scores are exactly zero"
                  ]
                },
                "auroc": 0.5123456790123457,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              },
              "tau_0.4": {
                "metric_name": "SE_tau_0.4",
                "distribution_analysis": {
                  "metric_name": "SE_tau_0.4",
                  "n_samples": 162,
                  "n_unique_scores": 1,
                  "unique_score_ratio": 0.006172839506172839,
                  "n_zeros": "162",
                  "zero_proportion": 1.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.0
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      162,
                      1.0
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.0,
                    "benign_mean": 0.0,
                    "harmful_std": 0.0,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.006172839506172839,
                      "zero_proportion": 1.0,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 1/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 100.0% of scores are exactly zero"
                  ]
                },
                "auroc": 0.5,
                "delong_ci_valid": false,
                "delong_ci_error": "Distribution too degenerate for DeLong method",
                "bootstrap_ci": [
                  NaN,
                  NaN
                ],
                "bootstrap_ci_valid": true,
                "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
              }
            },
            "BERTScore": {
              "metric_name": "BERTScore",
              "distribution_analysis": {
                "metric_name": "BERTScore",
                "n_samples": 162,
                "n_unique_scores": 162,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.7976115942001343,
                  0.9872315526008606
                ],
                "most_common_values": [
                  [
                    0.9213066101074219,
                    1,
                    0.006172839506172839
                  ],
                  [
                    0.9030462503433228,
                    1,
                    0.006172839506172839
                  ],
                  [
                    0.9509181976318359,
                    1,
                    0.006172839506172839
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.8953098029266169,
                  "benign_mean": 0.8966401719752654,
                  "harmful_std": 0.03916996932064665,
                  "benign_std": 0.021509652539969077
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.4311842706904435,
              "delong_ci": [
                0.339122449788644,
                0.523246091592243
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "EmbeddingVariance": {
              "metric_name": "EmbeddingVariance",
              "distribution_analysis": {
                "metric_name": "EmbeddingVariance",
                "n_samples": 162,
                "n_unique_scores": 162,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.0005865923594683409,
                  0.10136634111404419
                ],
                "most_common_values": [
                  [
                    0.015436764806509018,
                    1,
                    0.006172839506172839
                  ],
                  [
                    0.021777819842100143,
                    1,
                    0.006172839506172839
                  ],
                  [
                    0.024993762373924255,
                    1,
                    0.006172839506172839
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.04236425640762864,
                  "benign_mean": 0.024525531429659436,
                  "harmful_std": 0.024492743655985215,
                  "benign_std": 0.01199400310953629
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.7242798353909465,
              "delong_ci": [
                0.6426507738257684,
                0.8059088969561247
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "LevenshteinVariance": {
              "metric_name": "LevenshteinVariance",
              "distribution_analysis": {
                "metric_name": "LevenshteinVariance",
                "n_samples": 162,
                "n_unique_scores": 162,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  559.4100000000001,
                  1053490.44
                ],
                "most_common_values": [
                  [
                    28689.840000000004,
                    1,
                    0.006172839506172839
                  ],
                  [
                    137441.00999999998,
                    1,
                    0.006172839506172839
                  ],
                  [
                    8672.2,
                    1,
                    0.006172839506172839
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 119796.96555555554,
                  "benign_mean": 48593.51296296296,
                  "harmful_std": 197919.90692332902,
                  "benign_std": 57776.85486744233
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.572778539856729,
              "delong_ci": [
                0.48235181988623355,
                0.6632052598272247
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            }
          },
          "paired_comparisons": {
            "SE_vs_BERTScore": {
              "metric1_name": "SE_tau_0.2",
              "metric2_name": "BERTScore",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 162,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.018518518518518517,
                  "n_zeros": "153",
                  "zero_proportion": 0.9444444444444444,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      153,
                      0.9444444444444444
                    ],
                    [
                      0.7219280948873623,
                      7,
                      0.043209876543209874
                    ],
                    [
                      0.9709505944546686,
                      2,
                      0.012345679012345678
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.08636293645828239,
                    "benign_mean": 0.0,
                    "harmful_std": 0.24669690086900586,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.018518518518518517,
                      "zero_proportion": 0.9444444444444444,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 94.4% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "BERTScore",
                  "n_samples": 162,
                  "n_unique_scores": 162,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.7976115942001343,
                    0.9872315526008606
                  ],
                  "most_common_values": [
                    [
                      0.9213066101074219,
                      1,
                      0.006172839506172839
                    ],
                    [
                      0.9030462503433228,
                      1,
                      0.006172839506172839
                    ],
                    [
                      0.9509181976318359,
                      1,
                      0.006172839506172839
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.8953098029266169,
                    "benign_mean": 0.8966401719752654,
                    "harmful_std": 0.03916996932064665,
                    "benign_std": 0.021509652539969077
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.5555555555555556,
              "auroc2": 0.4311842706904435,
              "auroc_difference": -0.12437128486511206,
              "delong_test": {
                "valid": false,
                "reasons": [
                  "SE_tau_0.2 distribution too degenerate"
                ]
              }
            },
            "SE_vs_EmbeddingVariance": {
              "metric1_name": "SE_tau_0.2",
              "metric2_name": "EmbeddingVariance",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 162,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.018518518518518517,
                  "n_zeros": "153",
                  "zero_proportion": 0.9444444444444444,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      153,
                      0.9444444444444444
                    ],
                    [
                      0.7219280948873623,
                      7,
                      0.043209876543209874
                    ],
                    [
                      0.9709505944546686,
                      2,
                      0.012345679012345678
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.08636293645828239,
                    "benign_mean": 0.0,
                    "harmful_std": 0.24669690086900586,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.018518518518518517,
                      "zero_proportion": 0.9444444444444444,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 94.4% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "EmbeddingVariance",
                  "n_samples": 162,
                  "n_unique_scores": 162,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0005865923594683409,
                    0.10136634111404419
                  ],
                  "most_common_values": [
                    [
                      0.015436764806509018,
                      1,
                      0.006172839506172839
                    ],
                    [
                      0.021777819842100143,
                      1,
                      0.006172839506172839
                    ],
                    [
                      0.024993762373924255,
                      1,
                      0.006172839506172839
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.04236425640762864,
                    "benign_mean": 0.024525531429659436,
                    "harmful_std": 0.024492743655985215,
                    "benign_std": 0.01199400310953629
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.5555555555555556,
              "auroc2": 0.7242798353909465,
              "auroc_difference": 0.16872427983539096,
              "delong_test": {
                "valid": false,
                "reasons": [
                  "SE_tau_0.2 distribution too degenerate"
                ]
              }
            },
            "SE_vs_LevenshteinVariance": {
              "metric1_name": "SE_tau_0.2",
              "metric2_name": "LevenshteinVariance",
              "distribution_analysis": {
                "metric1": {
                  "metric_name": "SE_tau_0.2",
                  "n_samples": 162,
                  "n_unique_scores": 3,
                  "unique_score_ratio": 0.018518518518518517,
                  "n_zeros": "153",
                  "zero_proportion": 0.9444444444444444,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    0.0,
                    0.9709505944546686
                  ],
                  "most_common_values": [
                    [
                      0.0,
                      153,
                      0.9444444444444444
                    ],
                    [
                      0.7219280948873623,
                      7,
                      0.043209876543209874
                    ],
                    [
                      0.9709505944546686,
                      2,
                      0.012345679012345678
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 0.08636293645828239,
                    "benign_mean": 0.0,
                    "harmful_std": 0.24669690086900586,
                    "benign_std": 0.0
                  },
                  "is_degenerate": {
                    "severe": true,
                    "moderate": true,
                    "mild": true,
                    "details": {
                      "unique_score_ratio": 0.018518518518518517,
                      "zero_proportion": 0.9444444444444444,
                      "effective_discrimination": false
                    }
                  },
                  "delong_valid": false,
                  "statistical_warnings": [
                    "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                    "Only 3/162 unique scores",
                    "HIGH ZERO CONCENTRATION: 94.4% of scores are exactly zero"
                  ]
                },
                "metric2": {
                  "metric_name": "LevenshteinVariance",
                  "n_samples": 162,
                  "n_unique_scores": 162,
                  "unique_score_ratio": 1.0,
                  "n_zeros": "0",
                  "zero_proportion": 0.0,
                  "n_infinite": "0",
                  "n_nan": "0",
                  "score_range": [
                    559.4100000000001,
                    1053490.44
                  ],
                  "most_common_values": [
                    [
                      28689.840000000004,
                      1,
                      0.006172839506172839
                    ],
                    [
                      137441.00999999998,
                      1,
                      0.006172839506172839
                    ],
                    [
                      8672.2,
                      1,
                      0.006172839506172839
                    ]
                  ],
                  "class_separation": {
                    "harmful_mean": 119796.96555555554,
                    "benign_mean": 48593.51296296296,
                    "harmful_std": 197919.90692332902,
                    "benign_std": 57776.85486744233
                  },
                  "is_degenerate": {
                    "severe": "False",
                    "moderate": "False",
                    "mild": "False",
                    "details": {
                      "unique_score_ratio": 1.0,
                      "zero_proportion": 0.0,
                      "effective_discrimination": "True"
                    }
                  },
                  "delong_valid": true,
                  "statistical_warnings": []
                }
              },
              "auroc1": 0.5555555555555556,
              "auroc2": 0.572778539856729,
              "auroc_difference": 0.017222984301173416,
              "delong_test": {
                "valid": false,
                "reasons": [
                  "SE_tau_0.2 distribution too degenerate"
                ]
              }
            }
          }
        }
      }
    },
    "H5": {
      "hypothesis": "H5",
      "dataset": "JBB-Paraphrased",
      "models": {
        "llama": {
          "model_name": "llama",
          "metrics": {
            "SE_tau_0.2": {
              "metric_name": "SE_tau_0.2_paraphrased",
              "distribution_analysis": {
                "metric_name": "SE_tau_0.2_paraphrased",
                "n_samples": 115,
                "n_unique_scores": 6,
                "unique_score_ratio": 0.05217391304347826,
                "n_zeros": "94",
                "zero_proportion": 0.8173913043478261,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.0,
                  1.9219280948873623
                ],
                "most_common_values": [
                  [
                    0.0,
                    94,
                    0.8173913043478261
                  ],
                  [
                    0.7219280948873623,
                    9,
                    0.0782608695652174
                  ],
                  [
                    0.9709505944546686,
                    4,
                    0.034782608695652174
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.35298058233016066,
                  "benign_mean": 0.05316499795113144,
                  "harmful_std": 0.5817600614696563,
                  "benign_std": 0.1991303582918707
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": true,
                  "mild": true,
                  "details": {
                    "unique_score_ratio": 0.05217391304347826,
                    "zero_proportion": 0.8173913043478261,
                    "effective_discrimination": false
                  }
                },
                "delong_valid": true,
                "statistical_warnings": [
                  "MODERATE DEGENERACY: DeLong test assumptions may be violated",
                  "HIGH ZERO CONCENTRATION: 81.7% of scores are exactly zero"
                ]
              },
              "auroc": 0.62318401937046,
              "delong_ci": [
                0.5545392732218892,
                0.6918287655190305
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true,
              "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
            },
            "BERTScore": {
              "metric_name": "BERTScore_paraphrased",
              "distribution_analysis": {
                "metric_name": "BERTScore_paraphrased",
                "n_samples": 115,
                "n_unique_scores": 112,
                "unique_score_ratio": 0.9739130434782609,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.8523883819580078,
                  1.0
                ],
                "most_common_values": [
                  [
                    1.0,
                    4,
                    0.034782608695652174
                  ],
                  [
                    0.8668166995048523,
                    1,
                    0.008695652173913044
                  ],
                  [
                    0.9065595865249634,
                    1,
                    0.008695652173913044
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.9320168771914074,
                  "benign_mean": 0.9048117833622431,
                  "harmful_std": 0.0398682158799499,
                  "benign_std": 0.025185055092450528
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 0.9739130434782609,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.7139830508474576,
              "delong_ci": [
                0.614824739431143,
                0.8131413622637722
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "EmbeddingVariance": {
              "metric_name": "EmbeddingVariance_paraphrased",
              "distribution_analysis": {
                "metric_name": "EmbeddingVariance_paraphrased",
                "n_samples": 115,
                "n_unique_scores": 114,
                "unique_score_ratio": 0.991304347826087,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  2.094153966210289e-16,
                  0.15877926349639893
                ],
                "most_common_values": [
                  [
                    1.0296252985950681e-15,
                    2,
                    0.017391304347826087
                  ],
                  [
                    0.09556064009666443,
                    1,
                    0.008695652173913044
                  ],
                  [
                    0.01616920530796051,
                    1,
                    0.008695652173913044
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.05030236162409507,
                  "benign_mean": 0.025103097521084344,
                  "harmful_std": 0.04382593965388317,
                  "benign_std": 0.018483155207940572
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 0.991304347826087,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.6622276029055689,
              "delong_ci": [
                0.5561708340018241,
                0.7682843718093137
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            }
          }
        },
        "qwen": {
          "model_name": "qwen",
          "metrics": {
            "SE_tau_0.2": {
              "metric_name": "SE_tau_0.2_paraphrased",
              "distribution_analysis": {
                "metric_name": "SE_tau_0.2_paraphrased",
                "n_samples": 115,
                "n_unique_scores": 5,
                "unique_score_ratio": 0.043478260869565216,
                "n_zeros": "91",
                "zero_proportion": 0.7913043478260869,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.0,
                  1.9219280948873623
                ],
                "most_common_values": [
                  [
                    0.0,
                    91,
                    0.7913043478260869
                  ],
                  [
                    0.7219280948873623,
                    13,
                    0.11304347826086956
                  ],
                  [
                    0.9709505944546686,
                    7,
                    0.06086956521739131
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.20716300581833766,
                  "benign_mean": 0.1875983635328971,
                  "harmful_std": 0.3640689475485539,
                  "benign_std": 0.4531407027668325
                },
                "is_degenerate": {
                  "severe": true,
                  "moderate": true,
                  "mild": true,
                  "details": {
                    "unique_score_ratio": 0.043478260869565216,
                    "zero_proportion": 0.7913043478260869,
                    "effective_discrimination": false
                  }
                },
                "delong_valid": false,
                "statistical_warnings": [
                  "SEVERE DEGENERACY: Distribution unsuitable for DeLong AUROC confidence intervals",
                  "Only 5/115 unique scores"
                ]
              },
              "auroc": 0.5351089588377724,
              "delong_ci_valid": false,
              "delong_ci_error": "Distribution too degenerate for DeLong method",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true,
              "bootstrap_warning": "Bootstrap CI may be unreliable due to degenerate distribution"
            },
            "BERTScore": {
              "metric_name": "BERTScore_paraphrased",
              "distribution_analysis": {
                "metric_name": "BERTScore_paraphrased",
                "n_samples": 115,
                "n_unique_scores": 115,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.8220413327217102,
                  0.9401082992553711
                ],
                "most_common_values": [
                  [
                    0.910913348197937,
                    1,
                    0.008695652173913044
                  ],
                  [
                    0.8719528317451477,
                    1,
                    0.008695652173913044
                  ],
                  [
                    0.876290500164032,
                    1,
                    0.008695652173913044
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.8882003352046013,
                  "benign_mean": 0.8813493100263304,
                  "harmful_std": 0.02222974623452743,
                  "benign_std": 0.01659127234827935
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.6056295399515738,
              "delong_ci": [
                0.5010651801172927,
                0.710193899785855
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "EmbeddingVariance": {
              "metric_name": "EmbeddingVariance_paraphrased",
              "distribution_analysis": {
                "metric_name": "EmbeddingVariance_paraphrased",
                "n_samples": 115,
                "n_unique_scores": 115,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.009067676961421967,
                  0.1345967799425125
                ],
                "most_common_values": [
                  [
                    0.05552436783909798,
                    1,
                    0.008695652173913044
                  ],
                  [
                    0.07487007975578308,
                    1,
                    0.008695652173913044
                  ],
                  [
                    0.025713246315717697,
                    1,
                    0.008695652173913044
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.05411197632617716,
                  "benign_mean": 0.0380041602272856,
                  "harmful_std": 0.024318734939215488,
                  "benign_std": 0.028665559823857354
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.7021791767554479,
              "delong_ci": [
                0.6029255129027,
                0.8014328406081959
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            }
          }
        }
      }
    },
    "H7": {
      "hypothesis": "H7",
      "dataset": "JBB-SOTA-Subset",
      "models": {
        "qwen-2.5-72b-instruct": {
          "model_name": "qwen-2.5-72b-instruct",
          "metrics": {
            "semantic_entropy": {},
            "BERTScore": {
              "metric_name": "BERTScore_SOTA",
              "distribution_analysis": {
                "metric_name": "BERTScore_SOTA",
                "n_samples": 120,
                "n_unique_scores": 120,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.8404114842414856,
                  0.9617641568183899
                ],
                "most_common_values": [
                  [
                    0.8602516055107117,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.8991512060165405,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.8471435308456421,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.8991622855265935,
                  "benign_mean": 0.892102399468422,
                  "harmful_std": 0.021030124427228795,
                  "benign_std": 0.021732606266427902
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.5613888888888889,
              "delong_ci": [
                0.4569723571166875,
                0.6658054206610904
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "EmbeddingVariance": {
              "metric_name": "EmbeddingVariance_SOTA",
              "distribution_analysis": {
                "metric_name": "EmbeddingVariance_SOTA",
                "n_samples": 120,
                "n_unique_scores": 120,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.0038171312771737576,
                  0.12665219604969025
                ],
                "most_common_values": [
                  [
                    0.06895110011100769,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.009394926019012928,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.019754689186811447,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.044811467500403525,
                  "benign_mean": 0.02894032873058071,
                  "harmful_std": 0.02129681914466809,
                  "benign_std": 0.028308453374033437
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.7325,
              "delong_ci": [
                0.6359962412032301,
                0.8290037587967702
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "LevenshteinVariance": {
              "metric_name": "LevenshteinVariance_SOTA",
              "distribution_analysis": {
                "metric_name": "LevenshteinVariance_SOTA",
                "n_samples": 120,
                "n_unique_scores": 120,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  136.96000000000004,
                  2980988.8400000003
                ],
                "most_common_values": [
                  [
                    338620.96,
                    1,
                    0.008333333333333333
                  ],
                  [
                    54231.88999999999,
                    1,
                    0.008333333333333333
                  ],
                  [
                    65438.45,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 299749.6785,
                  "benign_mean": 96934.67733333333,
                  "harmful_std": 650045.0314689156,
                  "benign_std": 269890.90985958074
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.5197222222222222,
              "delong_ci": [
                0.4103069920282314,
                0.629137452416213
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            }
          }
        },
        "llama-3.3-70b-instruct": {
          "model_name": "llama-3.3-70b-instruct",
          "metrics": {
            "semantic_entropy": {},
            "BERTScore": {
              "metric_name": "BERTScore_SOTA",
              "distribution_analysis": {
                "metric_name": "BERTScore_SOTA",
                "n_samples": 120,
                "n_unique_scores": 117,
                "unique_score_ratio": 0.975,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.837664008140564,
                  0.9999998807907104
                ],
                "most_common_values": [
                  [
                    0.9999998807907104,
                    4,
                    0.03333333333333333
                  ],
                  [
                    0.886157214641571,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.9116882085800171,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.9142023642857869,
                  "benign_mean": 0.8988115400075912,
                  "harmful_std": 0.03882497953823419,
                  "benign_std": 0.027492590431643732
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 0.975,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.6280555555555555,
              "delong_ci": [
                0.5251687008736773,
                0.7309424102374339
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "EmbeddingVariance": {
              "metric_name": "EmbeddingVariance_SOTA",
              "distribution_analysis": {
                "metric_name": "EmbeddingVariance_SOTA",
                "n_samples": 120,
                "n_unique_scores": 120,
                "unique_score_ratio": 1.0,
                "n_zeros": "0",
                "zero_proportion": 0.0,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  1.598465520916744e-16,
                  0.21181774139404297
                ],
                "most_common_values": [
                  [
                    0.016755787655711174,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.00928562693297863,
                    1,
                    0.008333333333333333
                  ],
                  [
                    0.022033223882317543,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 0.08400401604982714,
                  "benign_mean": 0.028475395900507777,
                  "harmful_std": 0.05750047345383376,
                  "benign_std": 0.02740145527428241
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 1.0,
                    "zero_proportion": 0.0,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.8094444444444444,
              "delong_ci": [
                0.7290169254836567,
                0.8898719634052321
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            },
            "LevenshteinVariance": {
              "metric_name": "LevenshteinVariance_SOTA",
              "distribution_analysis": {
                "metric_name": "LevenshteinVariance_SOTA",
                "n_samples": 120,
                "n_unique_scores": 117,
                "unique_score_ratio": 0.975,
                "n_zeros": "4",
                "zero_proportion": 0.03333333333333333,
                "n_infinite": "0",
                "n_nan": "0",
                "score_range": [
                  0.0,
                  1634904.0
                ],
                "most_common_values": [
                  [
                    0.0,
                    4,
                    0.03333333333333333
                  ],
                  [
                    17569.69,
                    1,
                    0.008333333333333333
                  ],
                  [
                    139196.21,
                    1,
                    0.008333333333333333
                  ]
                ],
                "class_separation": {
                  "harmful_mean": 98783.42400000001,
                  "benign_mean": 109355.08133333335,
                  "harmful_std": 195366.51422851236,
                  "benign_std": 245540.5521710475
                },
                "is_degenerate": {
                  "severe": "False",
                  "moderate": "False",
                  "mild": "False",
                  "details": {
                    "unique_score_ratio": 0.975,
                    "zero_proportion": 0.03333333333333333,
                    "effective_discrimination": "True"
                  }
                },
                "delong_valid": true,
                "statistical_warnings": []
              },
              "auroc": 0.3288888888888889,
              "delong_ci": [
                0.22415025072506786,
                0.4336275270527099
              ],
              "delong_ci_valid": true,
              "delong_method": "MLstatkit",
              "bootstrap_ci": [
                NaN,
                NaN
              ],
              "bootstrap_ci_valid": true
            }
          }
        }
      },
      "cross_model_analysis": {
        "claim": "Larger models exhibit worse SE but equal/better baseline performance",
        "comparison_pairs": [],
        "statistical_evidence": {},
        "se_comparison": {
          "error": "'tau_0.3'"
        },
        "baseline_comparison": {
          "model1": "qwen-2.5-72b-instruct",
          "model2": "llama-3.3-70b-instruct",
          "metric": "BERTScore",
          "auroc1": 0.5613888888888889,
          "auroc2": 0.6280555555555555,
          "difference": 0.06666666666666654,
          "better_model": "llama-3.3-70b-instruct"
        }
      }
    }
  },
  "cross_hypothesis_comparisons": {
    "se_degeneracy_progression": {
      "hypothesis_summary": {
        "H1": {
          "llama4scout": [
            {
              "tau": "tau_0.1",
              "zero_proportion": 0.5916666666666667,
              "unique_ratio": 0.05,
              "is_severe": "False"
            },
            {
              "tau": "tau_0.2",
              "zero_proportion": 0.7666666666666667,
              "unique_ratio": 0.041666666666666664,
              "is_severe": true
            },
            {
              "tau": "tau_0.3",
              "zero_proportion": 0.8583333333333333,
              "unique_ratio": 0.025,
              "is_severe": true
            },
            {
              "tau": "tau_0.4",
              "zero_proportion": 0.9166666666666666,
              "unique_ratio": 0.025,
              "is_severe": true
            }
          ],
          "qwen25": [
            {
              "tau": "tau_0.1",
              "zero_proportion": 0.4166666666666667,
              "unique_ratio": 0.058333333333333334,
              "is_severe": "False"
            },
            {
              "tau": "tau_0.2",
              "zero_proportion": 0.85,
              "unique_ratio": 0.058333333333333334,
              "is_severe": "False"
            },
            {
              "tau": "tau_0.3",
              "zero_proportion": 0.9666666666666667,
              "unique_ratio": 0.016666666666666666,
              "is_severe": true
            },
            {
              "tau": "tau_0.4",
              "zero_proportion": 1.0,
              "unique_ratio": 0.008333333333333333,
              "is_severe": true
            }
          ]
        },
        "H2": {
          "llama-4-scout-17b-16e-instruct": [
            {
              "tau": "tau_0.1",
              "zero_proportion": 0.6604938271604939,
              "unique_ratio": 0.037037037037037035,
              "is_severe": true
            },
            {
              "tau": "tau_0.2",
              "zero_proportion": 0.8827160493827161,
              "unique_ratio": 0.024691358024691357,
              "is_severe": true
            },
            {
              "tau": "tau_0.3",
              "zero_proportion": 0.9135802469135802,
              "unique_ratio": 0.024691358024691357,
              "is_severe": true
            },
            {
              "tau": "tau_0.4",
              "zero_proportion": 0.9320987654320988,
              "unique_ratio": 0.018518518518518517,
              "is_severe": true
            }
          ],
          "qwen2.5-7b-instruct": [
            {
              "tau": "tau_0.1",
              "zero_proportion": 0.6481481481481481,
              "unique_ratio": 0.043209876543209874,
              "is_severe": true
            },
            {
              "tau": "tau_0.2",
              "zero_proportion": 0.9444444444444444,
              "unique_ratio": 0.018518518518518517,
              "is_severe": true
            },
            {
              "tau": "tau_0.3",
              "zero_proportion": 0.9876543209876543,
              "unique_ratio": 0.012345679012345678,
              "is_severe": true
            },
            {
              "tau": "tau_0.4",
              "zero_proportion": 1.0,
              "unique_ratio": 0.006172839506172839,
              "is_severe": true
            }
          ]
        },
        "H5": {},
        "H7": {
          "qwen-2.5-72b-instruct": [],
          "llama-3.3-70b-instruct": []
        }
      },
      "overall_pattern": {}
    },
    "baseline_stability": {
      "bertscore_across_hypotheses": {
        "llama4scout": {
          "H1": {
            "auroc": 0.7672222222222222,
            "fnr": 0.6
          }
        },
        "qwen25": {
          "H1": {
            "auroc": 0.615,
            "fnr": 0.8666666666666667
          }
        },
        "llama-4-scout-17b-16e-instruct": {
          "H2": {
            "auroc": 0.5057155921353451,
            "fnr": null
          }
        },
        "qwen2.5-7b-instruct": {
          "H2": {
            "auroc": 0.4311842706904435,
            "fnr": null
          }
        },
        "llama": {
          "H5": {
            "auroc": 0.7139830508474576,
            "fnr": null
          }
        },
        "qwen": {
          "H5": {
            "auroc": 0.6056295399515738,
            "fnr": null
          }
        },
        "qwen-2.5-72b-instruct": {
          "H7": {
            "auroc": 0.5613888888888889,
            "fnr": null
          }
        },
        "llama-3.3-70b-instruct": {
          "H7": {
            "auroc": 0.6280555555555555,
            "fnr": null
          }
        }
      },
      "embedding_variance_across_hypotheses": {
        "llama4scout": {
          "H1": {
            "auroc": 0.6536111111111111,
            "fnr": 0.6666666666666666
          }
        },
        "qwen25": {
          "H1": {
            "auroc": 0.7205555555555556,
            "fnr": 0.9666666666666667
          }
        },
        "llama-4-scout-17b-16e-instruct": {
          "H2": {
            "auroc": 0.6837372351775645,
            "fnr": null
          }
        },
        "qwen2.5-7b-instruct": {
          "H2": {
            "auroc": 0.7242798353909465,
            "fnr": null
          }
        },
        "llama": {
          "H5": {
            "auroc": 0.6622276029055689,
            "fnr": null
          }
        },
        "qwen": {
          "H5": {
            "auroc": 0.7021791767554479,
            "fnr": null
          }
        },
        "qwen-2.5-72b-instruct": {
          "H7": {
            "auroc": 0.7325,
            "fnr": null
          }
        },
        "llama-3.3-70b-instruct": {
          "H7": {
            "auroc": 0.8094444444444444,
            "fnr": null
          }
        }
      },
      "stability_assessment": {}
    },
    "statistical_test_validity": {
      "delong_test_validity": {
        "H1": {
          "llama4scout": {
            "valid_tests": 4,
            "total_tests": 7,
            "validity_rate": 0.5714285714285714
          },
          "qwen25": {
            "valid_tests": 5,
            "total_tests": 7,
            "validity_rate": 0.7142857142857143
          }
        },
        "H2": {
          "llama-4-scout-17b-16e-instruct": {
            "valid_tests": 3,
            "total_tests": 7,
            "validity_rate": 0.42857142857142855
          },
          "qwen2.5-7b-instruct": {
            "valid_tests": 3,
            "total_tests": 7,
            "validity_rate": 0.42857142857142855
          }
        },
        "H5": {
          "llama": {
            "valid_tests": 3,
            "total_tests": 3,
            "validity_rate": 1.0
          },
          "qwen": {
            "valid_tests": 2,
            "total_tests": 3,
            "validity_rate": 0.6666666666666666
          }
        },
        "H7": {
          "qwen-2.5-72b-instruct": {
            "valid_tests": 3,
            "total_tests": 3,
            "validity_rate": 1.0
          },
          "llama-3.3-70b-instruct": {
            "valid_tests": 3,
            "total_tests": 3,
            "validity_rate": 1.0
          }
        }
      },
      "wilson_ci_coverage": {},
      "methodological_notes": []
    }
  },
  "methodological_notes": [],
  "methodological_summary": {
    "statistical_methods_summary": {
      "confidence_intervals": {
        "wilson_ci": "Used for all FNR confidence intervals (always valid)",
        "delong_ci": "Used for AUROC confidence intervals when distributions allow",
        "bootstrap_ci": "Fallback method for AUROC when DeLong assumptions violated"
      },
      "hypothesis_tests": {
        "delong_test": "Paired comparisons of AUROC between methods",
        "mcnemar_test": "Paired comparisons of binary classification performance"
      },
      "degeneracy_handling": "Explicit detection and transparent reporting of degenerate score distributions"
    },
    "key_findings": {
      "se_degeneracy": "Semantic entropy exhibits severe score degeneracy across all hypotheses",
      "statistical_validity": "Standard AUROC tests often inappropriate for SE due to degeneracy",
      "methodological_transparency": "Degeneracy itself constitutes evidence of SE failure"
    },
    "publication_recommendations": [
      "Report Wilson CIs for all FNR comparisons (always valid)",
      "Document when DeLong AUROC CIs are inappropriate due to degeneracy",
      "Emphasize that score degeneracy strengthens the SE failure argument",
      "Use bootstrap CIs as sensitivity analysis where appropriate"
    ]
  }
}